int do_test(int origin_count, MPI_Datatype origin_type, int result_count, MPI_Datatype result_type, int target_count, MPI_Datatype target_type) { int errs = 0, ret, origin_type_size, result_type_size; ret = MPI_Put(origin_buf, origin_count, origin_type, 1, 0, target_count, target_type, win); if (ret) errs++; ret = MPI_Get(origin_buf, origin_count, origin_type, 1, 0, target_count, target_type, win); if (ret) errs++; ret = MPI_Accumulate(origin_buf, origin_count, origin_type, 1, 0, target_count, target_type, MPI_SUM, win); if (ret) errs++; ret = MPI_Get_accumulate(origin_buf, origin_count, origin_type, result_buf, result_count, result_type, 1, 0, target_count, target_type, MPI_SUM, win); if (ret) errs++; MPI_Type_size(origin_type, &origin_type_size); MPI_Type_size(result_type, &result_type_size); if (origin_count == 0 || origin_type_size == 0) { ret = MPI_Put(NULL, origin_count, origin_type, 1, 0, target_count, target_type, win); if (ret) errs++; ret = MPI_Get(NULL, origin_count, origin_type, 1, 0, target_count, target_type, win); if (ret) errs++; ret = MPI_Accumulate(NULL, origin_count, origin_type, 1, 0, target_count, target_type, MPI_SUM, win); if (ret) errs++; ret = MPI_Get_accumulate(NULL, origin_count, origin_type, result_buf, result_count, result_type, 1, 0, target_count, target_type, MPI_SUM, win); if (ret) errs++; if (result_count == 0 || result_type_size == 0) { ret = MPI_Get_accumulate(NULL, origin_count, origin_type, NULL, result_count, result_type, 1, 0, target_count, target_type, MPI_SUM, win); if (ret) errs++; } } return errs; }
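/* Hedged driver sketch (not part of the original test): one plausible way to
 * call do_test().  It assumes the globals origin_buf, result_buf and win used
 * above were set up in main(), and that rank 1 exists.  All calls are placed
 * inside a fence epoch, since MPI_Put/MPI_Get/MPI_Accumulate may only be
 * issued within an access epoch. */
static int run_zero_count_cases(void)
{
    int errs = 0;
    MPI_Win_fence(0, win);                                    /* open epoch      */
    errs += do_test(0, MPI_INT, 0, MPI_INT, 0, MPI_INT);      /* all counts zero */
    errs += do_test(0, MPI_BYTE, 0, MPI_BYTE, 0, MPI_BYTE);   /* zero bytes      */
    MPI_Win_fence(0, win);                                    /* close epoch     */
    return errs;
}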
/*Run ACC with Fence */ void run_acc_with_fence(int rank, WINDOW type) { int size, i; MPI_Aint disp = 0; MPI_Win win; for (size = 0; size <= MAX_SIZE; size = (size ? size * 2 : 1)) { allocate_memory(rank, sbuf_original, rbuf_original, &sbuf, &rbuf, &sbuf, size, type, &win); #if MPI_VERSION >= 3 if (type == WIN_DYNAMIC) { disp = disp_remote; } #endif if(size > LARGE_MESSAGE_SIZE) { loop = LOOP_LARGE; skip = SKIP_LARGE; } MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); if(rank == 0) { for (i = 0; i < skip + loop; i++) { if (i == skip) { t_start = MPI_Wtime (); } MPI_CHECK(MPI_Win_fence(0, win)); MPI_CHECK(MPI_Accumulate(sbuf, size, MPI_CHAR, 1, disp, size, MPI_CHAR, MPI_SUM, win)); MPI_CHECK(MPI_Win_fence(0, win)); MPI_CHECK(MPI_Win_fence(0, win)); } t_end = MPI_Wtime (); } else { for (i = 0; i < skip + loop; i++) { MPI_CHECK(MPI_Win_fence(0, win)); MPI_CHECK(MPI_Win_fence(0, win)); MPI_CHECK(MPI_Accumulate(sbuf, size, MPI_CHAR, 0, disp, size, MPI_CHAR, MPI_SUM, win)); MPI_CHECK(MPI_Win_fence(0, win)); } } MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); if (rank == 0) { fprintf(stdout, "%-*d%*.*f\n", 10, size, FIELD_WIDTH, FLOAT_PRECISION, (t_end - t_start) * 1.0e6 / loop / 2); fflush(stdout); } free_memory (sbuf, rbuf, win, rank); } }
/** Lock a mutex.
  *
  * @param[in] hdl        Mutex group that the mutex belongs to.
  * @param[in] mutex      Desired mutex number [0..count-1]
  * @param[in] world_proc Absolute ID of process where the mutex lives
  */
void ARMCIX_Lock_hdl(armcix_mutex_hdl_t hdl, int mutex, int world_proc) {
  int  rank, nproc, proc;
  long lock_val, unlock_val, lock_out;
  int  timeout = 1;

  MPI_Comm_rank(hdl->comm, &rank);
  MPI_Comm_size(hdl->comm, &nproc);

  /* User gives us the absolute ID.  Translate to the rank in the mutex's group. */
  proc = ARMCII_Translate_absolute_to_group(hdl->comm, world_proc);
  ARMCII_Assert(proc >= 0);

  lock_val   = rank+1;          /* Map into range 1..nproc */
  unlock_val = -1 * (rank+1);

  /* mutex <- mutex + (rank+1) */
  MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->window);
  MPI_Accumulate(&lock_val, 1, MPI_LONG, proc, mutex, 1, MPI_LONG, MPI_SUM, hdl->window);
  MPI_Win_unlock(proc, hdl->window);

  for (;;) {
    /* read mutex value */
    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->window);
    MPI_Get(&lock_out, 1, MPI_LONG, proc, mutex, 1, MPI_LONG, hdl->window);
    MPI_Win_unlock(proc, hdl->window);

    ARMCII_Assert(lock_out > 0);
    ARMCII_Assert(lock_out <= nproc*(nproc+1)/2);  /* At most the sum of all lock values 1..nproc */

    /* We are holding the mutex */
    if (lock_out == rank+1)
      break;

    /* mutex <- mutex - (rank+1) */
    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->window);
    MPI_Accumulate(&unlock_val, 1, MPI_LONG, proc, mutex, 1, MPI_LONG, MPI_SUM, hdl->window);
    MPI_Win_unlock(proc, hdl->window);

    /* Exponential backoff */
    usleep(timeout + rand()%timeout);
    timeout = MIN(timeout*TIMEOUT_MUL, MAX_TIMEOUT);
    if (rand() % nproc == 0)  /* Chance to reset timeout */
      timeout = 1;

    /* mutex <- mutex + (rank+1) */
    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->window);
    MPI_Accumulate(&lock_val, 1, MPI_LONG, proc, mutex, 1, MPI_LONG, MPI_SUM, hdl->window);
    MPI_Win_unlock(proc, hdl->window);
  }
}
/** Lock a mutex. * * @param[in] hdl Handle to the mutex * @return MPI status */ int MCS_Mutex_lock(MCS_Mutex hdl) { int prev; /* This store is safe, since it cannot happen concurrently with a remote * write */ hdl->base[MCS_MTX_ELEM_DISP] = -1; MPI_Win_sync(hdl->window); MPI_Fetch_and_op(&shmem_world_rank, &prev, MPI_INT, hdl->tail_rank, MCS_MTX_TAIL_DISP, MPI_REPLACE, hdl->window); MPI_Win_flush(hdl->tail_rank, hdl->window); /* If there was a previous tail, update their next pointer and wait for * notification. Otherwise, the mutex was successfully acquired. */ if (prev != -1) { /* Wait for notification */ MPI_Status status; MPI_Accumulate(&shmem_world_rank, 1, MPI_INT, prev, MCS_MTX_ELEM_DISP, 1, MPI_INT, MPI_REPLACE, hdl->window); MPI_Win_flush(prev, hdl->window); debug_print("%2d: LOCK - waiting for notification from %d\n", shmem_world_rank, prev); MPI_Recv(NULL, 0, MPI_BYTE, prev, MCS_MUTEX_TAG, hdl->comm, &status); } debug_print("%2d: LOCK - lock acquired\n", shmem_world_rank); return MPI_SUCCESS; }
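/* Hedged usage sketch: the companion create/unlock/free calls are assumed to
 * come from the same MCS-mutex module (they are not shown above, and their
 * exact signatures are an assumption here).  Each process acquires the lock,
 * touches the protected data, and releases it; hand-off is FIFO via the
 * queued ranks. */
static void mcs_mutex_example(void)
{
    MCS_Mutex mtx;
    MCS_Mutex_create(0, MPI_COMM_WORLD, &mtx);   /* rank 0 hosts the tail pointer */
    MCS_Mutex_lock(mtx);
    /* ... critical section on the data the mutex protects ... */
    MCS_Mutex_unlock(mtx);
    MCS_Mutex_free(&mtx);
}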
/** One-sided accumulate operation with typed arguments.  Source buffer must be private.
  *
  * @param[in] mreg      Memory region
  * @param[in] src       Address of source data
  * @param[in] src_count Number of elements of the given type at the source
  * @param[in] src_type  MPI datatype of the source elements
  * @param[in] dst       Address of destination buffer
  * @param[in] dst_count Number of elements of the given type at the destination
  * @param[in] dst_type  MPI datatype of the destination elements
  * @param[in] proc      Absolute process id of target process
  * @return               0 on success, non-zero on failure
  */
int gmr_accumulate_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type,
                         void *dst, int dst_count, MPI_Datatype dst_type, int proc) {
  int        grp_proc;
  gmr_size_t disp;
  MPI_Aint   lb, extent;

  grp_proc = ARMCII_Translate_absolute_to_group(&mreg->group, proc);
  ARMCII_Assert(grp_proc >= 0);

  // Calculate displacement from beginning of the window
  if (dst == MPI_BOTTOM)
    disp = 0;
  else
    disp = (gmr_size_t) ((uint8_t*)dst - (uint8_t*)mreg->slices[proc].base);

  // Perform checks
  MPI_Type_get_true_extent(dst_type, &lb, &extent);
  ARMCII_Assert(mreg->lock_state != GMR_LOCK_UNLOCKED);
  ARMCII_Assert_msg(disp >= 0 && disp < mreg->slices[proc].size, "Invalid remote address");
  ARMCII_Assert_msg(disp + dst_count*extent <= mreg->slices[proc].size, "Transfer is out of range");

  MPI_Accumulate(src, src_count, src_type, grp_proc, (MPI_Aint) disp,
                 dst_count, dst_type, MPI_SUM, mreg->window);

  return 0;
}
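/* Hedged usage sketch: accumulating a contiguous run of doubles into a remote
 * allocation through the routine above.  'mreg' is assumed to describe the
 * registered region containing 'dst', and the caller is assumed to hold the
 * required window lock (see the lock_state assertion). */
static int gmr_acc_doubles(gmr_t *mreg, double *src, double *dst, int n, int world_proc)
{
    return gmr_accumulate_typed(mreg, src, n, MPI_DOUBLE,
                                dst, n, MPI_DOUBLE, world_proc);
}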
/** Add a value to an entry of a remote process's sync_images table.
  *
  * @param[in] target_rank Rank of the target process whose table is updated
  * @param[in] rank        Index (rank) of the table entry to add to
  * @param[in] value       Value to add to that entry
  */
static void _add_remote_sync_images_table(const int target_rank, const int rank, const int value)
{
  const int val = value;
  MPI_Accumulate(&val, 1, MPI_INT, target_rank, (MPI_Aint)&_sync_images_table_disp[rank],
                 1, MPI_INT, MPI_SUM, _xmp_mpi_onesided_win);
  XACC_DEBUG("accumulate(%d, %d) += %d", target_rank, rank, value);
}
int main(int argc, char *argv[]) { int n, myid, numprocs, i, ierr; double PI25DT = 3.141592653589793238462643; double mypi, pi, h, sum, x; MPI_Win nwin, piwin; MPI_Init(&argc,&argv); MPI_Comm_size(MPI_COMM_WORLD,&numprocs); MPI_Comm_rank(MPI_COMM_WORLD,&myid); if (myid == 0) { MPI_Win_create(&n, sizeof(int), 1, MPI_INFO_NULL, MPI_COMM_WORLD, &nwin); MPI_Win_create(&pi, sizeof(double), 1, MPI_INFO_NULL, MPI_COMM_WORLD, &piwin); } else { MPI_Win_create(MPI_BOTTOM, 0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &nwin); MPI_Win_create(MPI_BOTTOM, 0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &piwin); } while (1) { if (myid == 0) { fprintf(stdout, "Enter the number of intervals: (0 quits) "); fflush(stdout); ierr=scanf("%d",&n); pi = 0.0; } MPI_Win_fence(0, nwin); if (myid != 0) MPI_Get(&n, 1, MPI_INT, 0, 0, 1, MPI_INT, nwin); MPI_Win_fence(0, nwin); if (n == 0) break; else { h = 1.0 / (double) n; sum = 0.0; for (i = myid + 1; i <= n; i += numprocs) { x = h * ((double)i - 0.5); sum += (4.0 / (1.0 + x*x)); } mypi = h * sum; MPI_Win_fence( 0, piwin); MPI_Accumulate(&mypi, 1, MPI_DOUBLE, 0, 0, 1, MPI_DOUBLE, MPI_SUM, piwin); MPI_Win_fence(0, piwin); if (myid == 0) { fprintf(stdout, "pi is approximately %.16f, Error is %.16f\n", pi, fabs(pi - PI25DT)); fflush(stdout); } } } MPI_Win_free(&nwin); MPI_Win_free(&piwin); MPI_Finalize(); return 0; }
void DO_OP_LOOP(int dst, int iter) { int i, x; switch (OP_TYPE) { case OP_ACC: for (x = 0; x < iter; x++) { for (i = 0; i < NOP; i++) MPI_Accumulate(&locbuf[0], OP_SIZE, MPI_DOUBLE, dst, 0, OP_SIZE, MPI_DOUBLE, MPI_SUM, win); MPI_Win_flush(dst, win); } break; case OP_PUT: for (x = 0; x < iter; x++) { for (i = 0; i < NOP; i++) MPI_Put(&locbuf[0], OP_SIZE, MPI_DOUBLE, dst, 0, OP_SIZE, MPI_DOUBLE, win); MPI_Win_flush(dst, win); } break; case OP_GET: for (x = 0; x < iter; x++) { for (i = 0; i < NOP; i++) MPI_Get(&locbuf[0], OP_SIZE, MPI_DOUBLE, dst, 0, OP_SIZE, MPI_DOUBLE, win); MPI_Win_flush(dst, win); } break; } }
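/* Hedged sketch of the surrounding epoch (not shown above): MPI_Win_flush is
 * only legal inside a passive-target access epoch, so a caller would bracket
 * DO_OP_LOOP with MPI_Win_lock/MPI_Win_unlock (or lock_all/unlock_all) on the
 * same global 'win'. */
static void run_op_loop(int dst, int iter)
{
    MPI_Win_lock(MPI_LOCK_SHARED, dst, 0, win);  /* open passive-target epoch       */
    DO_OP_LOOP(dst, iter);                       /* NOP ops per flush, iter batches  */
    MPI_Win_unlock(dst, win);                    /* complete any remaining ops       */
}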
int main(int argc, char *argv[]) { int rank, nprocs, A[SIZE], B[SIZE], i; MPI_Comm CommDeuce; MPI_Win win; int errs = 0; MTest_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); if (nprocs < 2) { printf("Run this program with 2 or more processes\n"); MPI_Abort(MPI_COMM_WORLD, 1); } MPI_Comm_split(MPI_COMM_WORLD, (rank < 2), rank, &CommDeuce); if (rank < 2) { if (rank == 0) { for (i = 0; i < SIZE; i++) A[i] = B[i] = i; } else { for (i = 0; i < SIZE; i++) { A[i] = (-3) * i; B[i] = (-4) * i; } } MPI_Win_create(B, SIZE * sizeof(int), sizeof(int), MPI_INFO_NULL, CommDeuce, &win); MPI_Win_fence(0, win); if (rank == 0) { for (i = 0; i < SIZE - 1; i++) MPI_Put(A + i, 1, MPI_INT, 1, i, 1, MPI_INT, win); } else { for (i = 0; i < SIZE - 1; i++) MPI_Get(A + i, 1, MPI_INT, 0, i, 1, MPI_INT, win); MPI_Accumulate(A + i, 1, MPI_INT, 0, i, 1, MPI_INT, MPI_SUM, win); } MPI_Win_fence(0, win); if (rank == 1) { for (i = 0; i < SIZE - 1; i++) { if (A[i] != B[i]) { SQUELCH(printf("Put/Get Error: A[i]=%d, B[i]=%d\n", A[i], B[i]);); errs++; } } }
/** Attempt to lock a mutex (non-blocking).
  *
  * @param[in] hdl        Mutex group that the mutex belongs to.
  * @param[in] mutex      Desired mutex number [0..count-1]
  * @param[in] world_proc Absolute ID of process where the mutex lives
  * @return                0 on success, non-zero on failure
  */
int ARMCIX_Trylock_hdl(armcix_mutex_hdl_t hdl, int mutex, int world_proc) {
  int  rank, nproc, proc;
  long lock_val, unlock_val, lock_out;

  ARMCII_Assert(mutex >= 0);

  MPI_Comm_rank(hdl->comm, &rank);
  MPI_Comm_size(hdl->comm, &nproc);

  /* User gives us the absolute ID.  Translate to the rank in the mutex's group. */
  proc = ARMCII_Translate_absolute_to_group(hdl->comm, world_proc);
  ARMCII_Assert(proc >= 0);

  lock_val   = rank+1;
  unlock_val = -1 * (rank+1);

  /* mutex <- mutex + (rank+1) */
  MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->window);
  MPI_Accumulate(&lock_val, 1, MPI_LONG, proc, mutex, 1, MPI_LONG, MPI_SUM, hdl->window);
  MPI_Win_unlock(proc, hdl->window);

  /* read mutex value */
  MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->window);
  MPI_Get(&lock_out, 1, MPI_LONG, proc, mutex, 1, MPI_LONG, hdl->window);
  MPI_Win_unlock(proc, hdl->window);

  ARMCII_Assert(lock_out > 0);
  ARMCII_Assert(lock_out <= nproc*(nproc+1)/2);  /* At most the sum of all lock values 1..nproc */

  /* We are holding the mutex */
  if (lock_out == rank+1)
    return 0;

  /* mutex <- mutex - (rank+1) */
  MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->window);
  MPI_Accumulate(&unlock_val, 1, MPI_LONG, proc, mutex, 1, MPI_LONG, MPI_SUM, hdl->window);
  MPI_Win_unlock(proc, hdl->window);

  return 1;
}
int main(int argc, char **argv) { int i, rank, nproc; int errors = 0, all_errors = 0; int val = 0, one = 1; int iter; MPI_Aint *val_ptrs; MPI_Win dyn_win; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nproc); iter = ITER_PER_RANK * nproc; val_ptrs = malloc(nproc * sizeof(MPI_Aint)); MPI_Get_address(&val, &val_ptrs[rank]); MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, val_ptrs, 1, MPI_AINT, MPI_COMM_WORLD); MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &dyn_win); MPI_Win_attach(dyn_win, &val, sizeof(int)); for (i = 0; i < iter; i++) { MPI_Win_fence(MPI_MODE_NOPRECEDE, dyn_win); MPI_Accumulate(&one, 1, MPI_INT, i % nproc, val_ptrs[i % nproc], 1, MPI_INT, MPI_SUM, dyn_win); MPI_Win_fence(MPI_MODE_NOSUCCEED, dyn_win); } MPI_Barrier(MPI_COMM_WORLD); /* Read and verify my data */ if (val != iter) { errors++; printf("%d -- Got %d, expected %d\n", rank, val, iter); } MPI_Win_detach(dyn_win, &val); MPI_Win_free(&dyn_win); MPI_Reduce(&errors, &all_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); if (rank == 0 && all_errors == 0) printf(" No Errors\n"); free(val_ptrs); MPI_Finalize(); return 0; }
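/* Hedged variant sketch: the same per-target update as in the loop above, but
 * issued in a single passive-target epoch (lock_all) with a flush after each
 * accumulate instead of a pair of fences per iteration.  A barrier is still
 * needed before each rank reads its local 'val'. */
static void accumulate_passive(MPI_Win dyn_win, const MPI_Aint *val_ptrs,
                               int nproc, int iter)
{
    int i, one = 1;
    MPI_Win_lock_all(0, dyn_win);
    for (i = 0; i < iter; i++) {
        MPI_Accumulate(&one, 1, MPI_INT, i % nproc, val_ptrs[i % nproc],
                       1, MPI_INT, MPI_SUM, dyn_win);
        MPI_Win_flush(i % nproc, dyn_win);
    }
    MPI_Win_unlock_all(dyn_win);
}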
static int _ZMPI_Reduce_scatter_block_intsum_accumulate(const int *sendbuf, int nsendprocs, int *sendprocs, int *recvbuf, int recvcount, int nrecvprocs, int *recvprocs, MPI_Comm comm) { int i, j, size, rank; MPI_Win win; MPI_Comm_size(comm, &size); MPI_Comm_rank(comm, &rank); for (i = 0; i < recvcount; ++i) recvbuf[i] = DEFAULT_INT; MPI_Win_create(recvbuf, recvcount * sizeof(int), sizeof(int), MPI_INFO_NULL, comm, &win); MPI_Win_fence(MPI_MODE_NOSTORE|MPI_MODE_NOPRECEDE, win); if (nsendprocs >= 0) { for (j = 0; j < nsendprocs; ++j) { for (i = 0; i < recvcount; ++i) if (sendbuf[sendprocs[j] * recvcount + i] != DEFAULT_INT) break; if (i < recvcount) MPI_Accumulate((void *) &sendbuf[sendprocs[j] * recvcount], recvcount, MPI_INT, sendprocs[j], 0, recvcount, MPI_INT, MPI_SUM, win); } } else { for (j = 0; j < size; ++j) { for (i = 0; i < recvcount; ++i) if (sendbuf[j * recvcount + i] != DEFAULT_INT) break; if (i < recvcount) MPI_Accumulate((void *) &sendbuf[j * recvcount], recvcount, MPI_INT, j, 0, recvcount, MPI_INT, MPI_SUM, win); } } MPI_Win_fence(MPI_MODE_NOPUT|MPI_MODE_NOSUCCEED, win); MPI_Win_free(&win); return MPI_SUCCESS; }
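/* Hedged usage sketch: selecting the "all destinations" branch above by
 * passing a negative nsendprocs.  'sendbuf' is assumed to hold size*recvcount
 * ints laid out by destination rank; blocks consisting only of DEFAULT_INT
 * are skipped. */
static int reduce_scatter_all_ranks(const int *sendbuf, int *recvbuf,
                                    int recvcount, MPI_Comm comm)
{
    return _ZMPI_Reduce_scatter_block_intsum_accumulate(sendbuf, -1, NULL,
                                                        recvbuf, recvcount,
                                                        0, NULL, comm);
}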
int main(int argc, char **argv) { int i, j, rank, nranks, peer, bufsize, errors; double *buffer, *src_buf; MPI_Win buf_win; MTest_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nranks); bufsize = XDIM * YDIM * sizeof(double); MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &buffer); MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &src_buf); for (i = 0; i < XDIM * YDIM; i++) { *(buffer + i) = 1.0 + rank; *(src_buf + i) = 1.0 + rank; } MPI_Win_create(buffer, bufsize, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &buf_win); peer = (rank + 1) % nranks; for (i = 0; i < ITERATIONS; i++) { MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win); for (j = 0; j < YDIM; j++) { MPI_Accumulate(src_buf + j * XDIM, XDIM, MPI_DOUBLE, peer, j * XDIM * sizeof(double), XDIM, MPI_DOUBLE, MPI_SUM, buf_win); } MPI_Win_unlock(peer, buf_win); } MPI_Barrier(MPI_COMM_WORLD); MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, buf_win); for (i = errors = 0; i < XDIM; i++) { for (j = 0; j < YDIM; j++) { const double actual = *(buffer + i + j * XDIM); const double expected = (1.0 + rank) + (1.0 + ((rank + nranks - 1) % nranks)) * (ITERATIONS); if (fabs(actual - expected) > 1.0e-10) { SQUELCH(printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n", rank, j, i, expected, actual);); errors++; fflush(stdout); } }
int MPIX_Accumulate_x(const void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, MPI_Count target_count, MPI_Datatype target_datatype, MPI_Op op, MPI_Win win) { int rc = MPI_SUCCESS; if (likely (origin_count <= bigmpi_int_max && target_count <= bigmpi_int_max)) { rc = MPI_Accumulate(origin_addr, origin_count, origin_datatype, target_rank, target_disp, target_count, target_datatype, op, win); } else { MPI_Datatype neworigin_datatype, newtarget_datatype; MPIX_Type_contiguous_x(origin_count, origin_datatype, &neworigin_datatype); MPIX_Type_contiguous_x(target_count, target_datatype, &newtarget_datatype); MPI_Type_commit(&neworigin_datatype); MPI_Type_commit(&newtarget_datatype); rc = MPI_Accumulate(origin_addr, 1, neworigin_datatype, target_rank, target_disp, 1, newtarget_datatype, op, win); MPI_Type_free(&neworigin_datatype); MPI_Type_free(&newtarget_datatype); } return rc; }
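/* Hedged usage sketch: a call site that lets the wrapper above split an
 * accumulate whose element count may exceed INT_MAX.  'buf', 'n', 'target'
 * and 'win' are assumed to be provided by the caller inside an open access
 * epoch on 'win', with the target window large enough to hold n chars. */
static int accumulate_big(const char *buf, MPI_Count n, int target, MPI_Win win)
{
    return MPIX_Accumulate_x(buf, n, MPI_CHAR, target, 0 /* target_disp */,
                             n, MPI_CHAR, MPI_SUM, win);
}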
void RunAccFence(MPI_Win win, int destRank, int cnt, int sz) { int k, i, j, one = 1; for (k = 0; k < MAX_RUNS; k++) { MPI_Barrier(MPI_COMM_WORLD); MPI_Win_fence(0, win); j = 0; for (i = 0; i < cnt; i++) { MPI_Accumulate(&one, sz, MPI_INT, destRank, j, sz, MPI_INT, MPI_SUM, win); j += sz; } MPI_Win_fence(0, win); } }
void RunAccLock(MPI_Win win, int destRank, int cnt, int sz) { int k, i, j, one = 1; for (k = 0; k < MAX_RUNS; k++) { MPI_Barrier(MPI_COMM_WORLD); MPI_Win_lock(MPI_LOCK_SHARED, destRank, 0, win); j = 0; for (i = 0; i < cnt; i++) { MPI_Accumulate(&one, sz, MPI_INT, destRank, j, sz, MPI_INT, MPI_SUM, win); j += sz; } MPI_Win_unlock(destRank, win); } }
JNIEXPORT void JNICALL Java_mpi_Win_accumulate( JNIEnv *env, jobject jthis, jlong win, jobject origin, jint orgCount, jlong orgType, jint targetRank, jint targetDisp, jint targetCount, jlong targetType, jobject jOp, jlong hOp, jint baseType) { void *orgPtr = (*env)->GetDirectBufferAddress(env, origin); MPI_Op op = ompi_java_op_getHandle(env, jOp, hOp, baseType); int rc = MPI_Accumulate(orgPtr, orgCount, (MPI_Datatype)orgType, targetRank, (MPI_Aint)targetDisp, targetCount, (MPI_Datatype)targetType, op, (MPI_Win)win); ompi_java_exceptionCheck(env, rc); }
void RunAccPSCW(MPI_Win win, int destRank, int cnt, int sz, MPI_Group exposureGroup, MPI_Group accessGroup) { int k, i, j, one = 1; for (k = 0; k < MAX_RUNS; k++) { MPI_Barrier(MPI_COMM_WORLD); MPI_Win_post(exposureGroup, 0, win); MPI_Win_start(accessGroup, 0, win); j = 0; for (i = 0; i < cnt; i++) { MPI_Accumulate(&one, sz, MPI_INT, destRank, j, sz, MPI_INT, MPI_SUM, win); j += sz; } MPI_Win_complete(win); MPI_Win_wait(win); } }
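/* Hedged driver sketch for the RunAccFence/RunAccLock/RunAccPSCW runners above
 * (assumptions: exactly two ranks, each targeting the other, and a window that
 * exposes at least cnt*sz ints per rank).  The PSCW variant needs exposure and
 * access groups containing only the partner rank. */
static void run_all_acc_variants(MPI_Win win, int rank, int cnt, int sz)
{
    MPI_Group world_group, partner_group;
    int partner = (rank == 0) ? 1 : 0;

    MPI_Comm_group(MPI_COMM_WORLD, &world_group);
    MPI_Group_incl(world_group, 1, &partner, &partner_group);

    RunAccFence(win, partner, cnt, sz);
    RunAccLock(win, partner, cnt, sz);
    RunAccPSCW(win, partner, cnt, sz, partner_group, partner_group);

    MPI_Group_free(&partner_group);
    MPI_Group_free(&world_group);
}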
/** Unlock a mutex. * * @param[in] hdl Mutex group that the mutex belongs to. * @param[in] mutex Desired mutex number [0..count-1] * @param[in] world_proc Absolute ID of process where the mutex lives */ void ARMCIX_Unlock_hdl(armcix_mutex_hdl_t hdl, int mutex, int world_proc) { int rank, nproc, proc; long unlock_val; ARMCII_Assert(mutex >= 0); MPI_Comm_rank(hdl->comm, &rank); MPI_Comm_size(hdl->comm, &nproc); /* User gives us the absolute ID. Translate to the rank in the mutex's group. */ proc = ARMCII_Translate_absolute_to_group(hdl->comm, world_proc); ARMCII_Assert(proc >= 0); unlock_val = -1 * (rank+1); /* mutex <- mutex - rank */ MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->window); MPI_Accumulate(&unlock_val, 1, MPI_LONG, proc, mutex, 1, MPI_LONG, MPI_SUM, hdl->window); MPI_Win_unlock(proc, hdl->window); }
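/* Hedged usage sketch for the mutex routines above: given a mutex-group handle
 * 'hdl' created elsewhere in the library, protect a critical section with
 * mutex 0 hosted on world process 'owner'.  The non-blocking probe can be
 * retried with useful work in between instead of blocking in ARMCIX_Lock_hdl. */
static void critical_section_example(armcix_mutex_hdl_t hdl, int owner)
{
    while (ARMCIX_Trylock_hdl(hdl, 0, owner) != 0) {
        /* ... do other useful work, then retry ... */
    }
    /* ... critical section: update data owned by 'owner' ... */
    ARMCIX_Unlock_hdl(hdl, 0, owner);
}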
static PetscErrorCode PetscSFFetchAndOpBegin_Window(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op) { PetscErrorCode ierr; PetscInt i,nranks; const PetscMPIInt *ranks; const MPI_Datatype *mine,*remote; MPI_Win win; PetscFunctionBegin; ierr = PetscSFGetRanks(sf,&nranks,&ranks,NULL,NULL,NULL);CHKERRQ(ierr); ierr = PetscSFWindowGetDataTypes(sf,unit,&mine,&remote);CHKERRQ(ierr); ierr = PetscSFWindowOpTranslate(&op);CHKERRQ(ierr); ierr = PetscSFGetWindow(sf,unit,rootdata,PETSC_FALSE,0,0,0,&win);CHKERRQ(ierr); for (i=0; i<sf->nranks; i++) { ierr = MPI_Win_lock(MPI_LOCK_EXCLUSIVE,sf->ranks[i],0,win);CHKERRQ(ierr); ierr = MPI_Get(leafupdate,1,mine[i],ranks[i],0,1,remote[i],win);CHKERRQ(ierr); ierr = MPI_Accumulate((void*)leafdata,1,mine[i],ranks[i],0,1,remote[i],op,win);CHKERRQ(ierr); ierr = MPI_Win_unlock(ranks[i],win);CHKERRQ(ierr); } PetscFunctionReturn(0); }
PetscErrorCode PetscSFReduceBegin_Window(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op) { PetscSF_Window *w = (PetscSF_Window*)sf->data; PetscErrorCode ierr; PetscInt i,nranks; const PetscMPIInt *ranks; const MPI_Datatype *mine,*remote; MPI_Win win; PetscFunctionBegin; ierr = PetscSFGetRanks(sf,&nranks,&ranks,NULL,NULL,NULL);CHKERRQ(ierr); ierr = PetscSFWindowGetDataTypes(sf,unit,&mine,&remote);CHKERRQ(ierr); ierr = PetscSFWindowOpTranslate(&op);CHKERRQ(ierr); ierr = PetscSFGetWindow(sf,unit,rootdata,PETSC_TRUE,MPI_MODE_NOPRECEDE,0,0,&win);CHKERRQ(ierr); for (i=0; i<nranks; i++) { if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) {ierr = MPI_Win_lock(MPI_LOCK_SHARED,ranks[i],MPI_MODE_NOCHECK,win);CHKERRQ(ierr);} ierr = MPI_Accumulate((void*)leafdata,1,mine[i],ranks[i],0,1,remote[i],op,win);CHKERRQ(ierr); if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) {ierr = MPI_Win_unlock(ranks[i],win);CHKERRQ(ierr);} } PetscFunctionReturn(0); }
void Get_nextval_tree(MPI_Win win, int *get_array, MPI_Datatype get_type, MPI_Datatype acc_type, int nlevels, int *value) { int *one, i; one = (int *) malloc(nlevels*sizeof(int)); for (i=0; i<nlevels; i++) one[i] = 1; MPI_Win_lock(MPI_LOCK_EXCLUSIVE, 0, 0, win); MPI_Accumulate(one, nlevels, MPI_INT, 0, 0, 1, acc_type, MPI_SUM, win); MPI_Get(get_array, nlevels, MPI_INT, 0, 0, 1, get_type, win); MPI_Win_unlock(0, win); *value = localvalue; for (i=0; i<nlevels; i++) *value = *value + get_array[i]; localvalue++; free(one); }
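/* Hedged alternative sketch (not part of the tree-based code above): with
 * MPI-3 a shared "next value" counter can be fetched and incremented in one
 * atomic call, avoiding the per-level partial sums.  'win' is assumed to
 * expose a single int on rank 0 that holds the counter. */
static int get_nextval_fop(MPI_Win win)
{
    int one = 1, value;
    MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, win);
    MPI_Fetch_and_op(&one, &value, MPI_INT, 0, 0, MPI_SUM, win);  /* value = old counter */
    MPI_Win_unlock(0, win);
    return value;
}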
/*Run ACC with Lock/unlock */ void run_acc_with_lock(int rank, WINDOW type) { int size, i; MPI_Aint disp = 0; MPI_Win win; for (size = 0; size <= MAX_SIZE; size = (size ? size * 2 : 1)) { allocate_memory(rank, sbuf_original, rbuf_original, &sbuf, &rbuf, &sbuf, size, type, &win); #if MPI_VERSION >= 3 if (type == WIN_DYNAMIC) { disp = disp_remote; } #endif if(size > LARGE_MESSAGE_SIZE) { loop = LOOP_LARGE; skip = SKIP_LARGE; } if(rank == 0) { for (i = 0; i < skip + loop; i++) { if (i == skip) { t_start = MPI_Wtime (); } MPI_CHECK(MPI_Win_lock(MPI_LOCK_SHARED, 1, 0, win)); MPI_CHECK(MPI_Accumulate(sbuf, size, MPI_CHAR, 1, disp, size, MPI_CHAR, MPI_SUM, win)); MPI_CHECK(MPI_Win_unlock(1, win)); } t_end = MPI_Wtime (); } MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); print_latency(rank, size); free_memory (sbuf, rbuf, win, rank); } }
void ompi_accumulate_f(char *origin_addr, MPI_Fint *origin_count, MPI_Fint *origin_datatype, MPI_Fint *target_rank, MPI_Aint *target_disp, MPI_Fint *target_count, MPI_Fint *target_datatype, MPI_Fint *op, MPI_Fint *win, MPI_Fint *ierr) { int ierr_c; MPI_Datatype c_origin_datatype = MPI_Type_f2c(*origin_datatype); MPI_Datatype c_target_datatype = MPI_Type_f2c(*target_datatype); MPI_Win c_win = MPI_Win_f2c(*win); MPI_Op c_op = MPI_Op_f2c(*op); ierr_c = MPI_Accumulate(OMPI_F2C_BOTTOM(origin_addr), OMPI_FINT_2_INT(*origin_count), c_origin_datatype, OMPI_FINT_2_INT(*target_rank), *target_disp, OMPI_FINT_2_INT(*target_count), c_target_datatype, c_op, c_win); if (NULL != ierr) *ierr = OMPI_INT_2_FINT(ierr_c); }
void oshmpi_lock(long * lockp) { MPI_Status status; oshmpi_lock_t *lock = (oshmpi_lock_t *) lockp; /* Replace myself with the last tail */ MPI_Fetch_and_op (&shmem_world_rank, &(lock->prev), MPI_INT, TAIL, TAIL_DISP, MPI_REPLACE, oshmpi_lock_win); MPI_Win_flush (TAIL, oshmpi_lock_win); /* Previous proc holding lock will eventually notify */ if (lock->prev != -1) { /* Send my shmem_world_rank to previous proc's next */ MPI_Accumulate (&shmem_world_rank, 1, MPI_INT, lock->prev, NEXT_DISP, 1, MPI_INT, MPI_REPLACE, oshmpi_lock_win); MPI_Win_flush (lock->prev, oshmpi_lock_win); MPI_Probe (lock->prev, MPI_ANY_TAG, MPI_COMM_WORLD, &status); } /* Hold lock */ oshmpi_lock_base[LOCK_DISP] = 1; MPI_Win_sync (oshmpi_lock_win); return; }
int main(int argc, char *argv[]) { int rank, nprocs, i; int *A, *B; MPI_Win win; MPI_Init(&argc,&argv); Test_Init_No_File(); MPI_Comm_size(MPI_COMM_WORLD,&nprocs); MPI_Comm_rank(MPI_COMM_WORLD,&rank); if (nprocs != 2) { printf("Run this program with 2 processes\n"); MPI_Abort(MPI_COMM_WORLD,1); } i = MPI_Alloc_mem(SIZE * sizeof(int), MPI_INFO_NULL, &A); if (i) { printf("Can't allocate memory in test program\n"); MPI_Abort(MPI_COMM_WORLD, 1); } i = MPI_Alloc_mem(SIZE * sizeof(int), MPI_INFO_NULL, &B); if (i) { printf("Can't allocate memory in test program\n"); MPI_Abort(MPI_COMM_WORLD, 1); } if (rank == 0) { for (i=0; i<SIZE; i++) A[i] = B[i] = i; } else { for (i=0; i<SIZE; i++) { A[i] = (-3)*i; B[i] = (-4)*i; } } MPI_Win_create(B, SIZE*sizeof(int), sizeof(int), MPI_INFO_NULL, MPI_COMM_WORLD, &win); MPI_Win_fence(0, win); if (rank == 0) { for (i=0; i<SIZE-1; i++) MPI_Put(A+i, 1, MPI_INT, 1, i, 1, MPI_INT, win); } else { for (i=0; i<SIZE-1; i++) MPI_Get(A+i, 1, MPI_INT, 0, i, 1, MPI_INT, win); MPI_Accumulate(A+i, 1, MPI_INT, 0, i, 1, MPI_INT, MPI_SUM, win); } MPI_Win_fence(0, win); if (rank == 1) { for (i=0; i<SIZE-1; i++) { if (A[i] != B[i]) { printf("Put/Get Error: A[i]=%d, B[i]=%d\n", A[i], B[i]); Test_Failed(NULL); } } } else { if (B[SIZE-1] != SIZE - 1 - 3*(SIZE-1)) { printf("Accumulate Error: B[SIZE-1] is %d, should be %d\n", B[SIZE-1], SIZE - 1 - 3*(SIZE-1)); Test_Failed(NULL); } } MPI_Win_free(&win); MPI_Free_mem(A); MPI_Free_mem(B); Test_Waitforall(); Test_Global_Summary(); MPI_Finalize(); return 0; }
/* tests passive target RMA on 2 processes. tests the lock-single_op-unlock optimization for less common cases: origin datatype derived, target datatype predefined */ int main(int argc, char *argv[]) { int wrank, nprocs, *srcbuf, *rmabuf, i; int memsize; MPI_Datatype vectype; MPI_Win win; int errs = 0; MTest_Init(&argc,&argv); MPI_Comm_size(MPI_COMM_WORLD,&nprocs); MPI_Comm_rank(MPI_COMM_WORLD,&wrank); if (nprocs < 2) { printf("Run this program with 2 or more processes\n"); MPI_Abort(MPI_COMM_WORLD, 1); } memsize = 10 * 4 * nprocs; /* Create and initialize data areas */ srcbuf = (int *)malloc( sizeof(int) * memsize ); MPI_Alloc_mem( sizeof(int) * memsize, MPI_INFO_NULL, &rmabuf ); if (!srcbuf || !rmabuf) { printf( "Unable to allocate srcbuf and rmabuf of size %d\n", memsize ); MPI_Abort( MPI_COMM_WORLD, 1 ); } for (i=0; i<memsize; i++) { rmabuf[i] = -i; srcbuf[i] = i; } MPI_Win_create( rmabuf, memsize*sizeof(int), sizeof(int), MPI_INFO_NULL, MPI_COMM_WORLD, &win ); /* Vector of 10 elements, separated by 4 */ MPI_Type_vector( 10, 1, 4, MPI_INT, &vectype ); MPI_Type_commit( &vectype ); /* Accumulate with a derived origin type and target predefined type*/ if (wrank == 0) { MPI_Barrier( MPI_COMM_WORLD ); MPI_Win_lock( MPI_LOCK_EXCLUSIVE, 0, 0, win ); for (i=0; i<10; i++) { if (rmabuf[i] != -i + 4*i) { errs++; printf( "Acc: expected rmabuf[%d] = %d but saw %d\n", i, -i + 4*i, rmabuf[i] ); } rmabuf[i] = -i; } for (i=10; i<memsize; i++) { if (rmabuf[i] != -i) { errs++; printf( "Acc: expected rmabuf[%d] = %d but saw %d\n", i, -i, rmabuf[i] ); rmabuf[i] = -i; } } MPI_Win_unlock( 0, win ); } else if (wrank == 1) { MPI_Win_lock( MPI_LOCK_SHARED, 0, 0, win ); MPI_Accumulate( srcbuf, 1, vectype, 0, 0, 10, MPI_INT, MPI_SUM, win ); MPI_Win_unlock( 0, win ); MPI_Barrier( MPI_COMM_WORLD ); } else { MPI_Barrier( MPI_COMM_WORLD ); } MPI_Barrier(MPI_COMM_WORLD); /* Put with a derived origin type and target predefined type*/ if (wrank == 0) { MPI_Barrier( MPI_COMM_WORLD ); MPI_Win_lock( MPI_LOCK_EXCLUSIVE, 0, 0, win ); for (i=0; i<10; i++) { if (rmabuf[i] != 4*i) { errs++; printf( "Put: expected rmabuf[%d] = %d but saw %d\n", i, 4*i, rmabuf[i] ); } rmabuf[i] = -i; } for (i=10; i<memsize; i++) { if (rmabuf[i] != -i) { errs++; printf( "Put: expected rmabuf[%d] = %d but saw %d\n", i, -i, rmabuf[i] ); rmabuf[i] = -i; } } MPI_Win_unlock( 0, win ); } else if (wrank == 1) { MPI_Win_lock( MPI_LOCK_SHARED, 0, 0, win ); MPI_Put( srcbuf, 1, vectype, 0, 0, 10, MPI_INT, win ); MPI_Win_unlock( 0, win ); MPI_Barrier( MPI_COMM_WORLD ); } else { MPI_Barrier( MPI_COMM_WORLD ); } MPI_Barrier(MPI_COMM_WORLD); /* Put with a derived origin type and target predefined type, with a get (see the move-to-end optimization) */ if (wrank == 0) { MPI_Barrier( MPI_COMM_WORLD ); MPI_Win_lock( MPI_LOCK_EXCLUSIVE, 0, 0, win ); for (i=0; i<10; i++) { if (rmabuf[i] != 4*i) { errs++; printf( "Put: expected rmabuf[%d] = %d but saw %d\n", i, 4*i, rmabuf[i] ); } rmabuf[i] = -i; } for (i=10; i<memsize; i++) { if (rmabuf[i] != -i) { errs++; printf( "Put: expected rmabuf[%d] = %d but saw %d\n", i, -i, rmabuf[i] ); rmabuf[i] = -i; } } MPI_Win_unlock( 0, win ); } else if (wrank == 1) { int val; MPI_Win_lock( MPI_LOCK_SHARED, 0, 0, win ); MPI_Get( &val, 1, MPI_INT, 0, 10, 1, MPI_INT, win ); MPI_Put( srcbuf, 1, vectype, 0, 0, 10, MPI_INT, win ); MPI_Win_unlock( 0, win ); MPI_Barrier( MPI_COMM_WORLD ); if (val != -10) { errs++; printf( "Get: Expected -10, got %d\n", val ); } } else { MPI_Barrier( MPI_COMM_WORLD ); } MPI_Barrier(MPI_COMM_WORLD); /* Put with a 
derived origin type and target predefined type, with a get already at the end (see the move-to-end optimization) */ if (wrank == 0) { MPI_Barrier( MPI_COMM_WORLD ); MPI_Win_lock( MPI_LOCK_EXCLUSIVE, 0, 0, win ); for (i=0; i<10; i++) { if (rmabuf[i] != 4*i) { errs++; printf( "Put: expected rmabuf[%d] = %d but saw %d\n", i, 4*i, rmabuf[i] ); } rmabuf[i] = -i; } for (i=10; i<memsize; i++) { if (rmabuf[i] != -i) { errs++; printf( "Put: expected rmabuf[%d] = %d but saw %d\n", i, -i, rmabuf[i] ); rmabuf[i] = -i; } } MPI_Win_unlock( 0, win ); } else if (wrank == 1) { int val; MPI_Win_lock( MPI_LOCK_SHARED, 0, 0, win ); MPI_Put( srcbuf, 1, vectype, 0, 0, 10, MPI_INT, win ); MPI_Get( &val, 1, MPI_INT, 0, 10, 1, MPI_INT, win ); MPI_Win_unlock( 0, win ); MPI_Barrier( MPI_COMM_WORLD ); if (val != -10) { errs++; printf( "Get: Expected -10, got %d\n", val ); } } else { MPI_Barrier( MPI_COMM_WORLD ); } MPI_Win_free( &win ); MPI_Free_mem( rmabuf ); free( srcbuf ); MPI_Type_free( &vectype ); MTest_Finalize(errs); MPI_Finalize(); return 0; }
int main(int argc, char *argv[]) { MPI_Win win; int errors = 0; int rank, nproc, i; double *orig_buf; double *tar_buf; MPI_Datatype vector_dtp; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &nproc); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Alloc_mem(sizeof(double) * DATA_SIZE, MPI_INFO_NULL, &orig_buf); MPI_Alloc_mem(sizeof(double) * DATA_SIZE, MPI_INFO_NULL, &tar_buf); for (i = 0; i < DATA_SIZE; i++) { orig_buf[i] = 1.0; tar_buf[i] = 0.5; } MPI_Type_vector(5 /* count */ , 3 /* blocklength */ , 5 /* stride */ , MPI_DOUBLE, &vector_dtp); MPI_Type_commit(&vector_dtp); MPI_Win_create(tar_buf, sizeof(double) * DATA_SIZE, sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &win); if (rank == 0) { MPI_Win_lock(MPI_LOCK_SHARED, 1, 0, win); MPI_Accumulate(orig_buf, 1, vector_dtp, 1, 0, 1, vector_dtp, MPI_SUM, win); MPI_Win_unlock(1, win); } MPI_Win_fence(0, win); if (rank == 1) { for (i = 0; i < DATA_SIZE; i++) { if (i % 5 < 3) { if (tar_buf[i] != 1.5) { printf("tar_buf[i] = %f (expected 1.5)\n", tar_buf[i]); errors++; } } else { if (tar_buf[i] != 0.5) { printf("tar_buf[i] = %f (expected 0.5)\n", tar_buf[i]); errors++; } } } } MPI_Type_free(&vector_dtp); MPI_Barrier(MPI_COMM_WORLD); if (rank == 0) { MPI_Win_lock(MPI_LOCK_SHARED, 1, 0, win); MPI_Accumulate(orig_buf, DATA_SIZE, MPI_DOUBLE, 1, 0, DATA_SIZE, MPI_DOUBLE, MPI_SUM, win); MPI_Win_unlock(1, win); } MPI_Win_fence(0, win); if (rank == 1) { for (i = 0; i < DATA_SIZE; i++) { if (i % 5 < 3) { if (tar_buf[i] != 2.5) { printf("tar_buf[i] = %f (expected 2.5)\n", tar_buf[i]); errors++; } } else { if (tar_buf[i] != 1.5) { printf("tar_buf[i] = %f (expected 1.5)\n", tar_buf[i]); errors++; } } } } MPI_Win_free(&win); MPI_Free_mem(orig_buf); MPI_Free_mem(tar_buf); if (rank == 1) { if (errors == 0) printf(" No Errors\n"); } MPI_Finalize(); return 0; }
int main(int argc, char *argv[]) { int rank, nproc; int i; MPI_Win win; int *tar_buf = NULL; int *orig_buf = NULL; MPI_Datatype derived_dtp; int errors = 0; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &nproc); MPI_Comm_rank(MPI_COMM_WORLD, &rank); if (nproc < 3) { fprintf(stderr, "Run this program with at least 3 processes\n"); MPI_Abort(MPI_COMM_WORLD, 1); } MPI_Alloc_mem(sizeof(int) * DATA_SIZE, MPI_INFO_NULL, &orig_buf); MPI_Alloc_mem(sizeof(int) * DATA_SIZE, MPI_INFO_NULL, &tar_buf); for (i = 0; i < DATA_SIZE; i++) { orig_buf[i] = 1; tar_buf[i] = 0; } MPI_Type_vector(COUNT, BLOCKLENGTH - 1, STRIDE, MPI_INT, &derived_dtp); MPI_Type_commit(&derived_dtp); MPI_Win_create(tar_buf, sizeof(int) * DATA_SIZE, sizeof(int), MPI_INFO_NULL, MPI_COMM_WORLD, &win); /***** test between rank 0 and rank 1 *****/ if (rank == 1) { MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, win); for (i = 0; i < OPS_NUM; i++) { MPI_Accumulate(orig_buf, 1, derived_dtp, 0, 0, DATA_SIZE - COUNT, MPI_INT, MPI_SUM, win); MPI_Win_flush_local(0, win); } MPI_Win_unlock(0, win); } MPI_Barrier(MPI_COMM_WORLD); /* check results */ if (rank == 0) { for (i = 0; i < DATA_SIZE - COUNT; i++) { if (tar_buf[i] != OPS_NUM) { printf("tar_buf[%d] = %d, expected %d\n", i, tar_buf[i], OPS_NUM); errors++; } } } for (i = 0; i < DATA_SIZE; i++) { tar_buf[i] = 0; } MPI_Barrier(MPI_COMM_WORLD); /***** test between rank 0 and rank 2 *****/ if (rank == 2) { MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, win); for (i = 0; i < OPS_NUM; i++) { MPI_Accumulate(orig_buf, 1, derived_dtp, 0, 0, DATA_SIZE - COUNT, MPI_INT, MPI_SUM, win); MPI_Win_flush_local(0, win); } MPI_Win_unlock(0, win); } MPI_Barrier(MPI_COMM_WORLD); /* check results */ if (rank == 0) { for (i = 0; i < DATA_SIZE - COUNT; i++) { if (tar_buf[i] != OPS_NUM) { printf("tar_buf[%d] = %d, expected %d\n", i, tar_buf[i], OPS_NUM); errors++; } } if (errors == 0) printf(" No Errors\n"); } MPI_Win_free(&win); MPI_Type_free(&derived_dtp); MPI_Free_mem(orig_buf); MPI_Free_mem(tar_buf); MPI_Finalize(); return 0; }
void test_dim(int ndim) { int dim,elems; int i,j, proc; /* double a[DIM4][DIM3][DIM2][DIM1], b[EDIM4][EDIM3][EDIM2][EDIM1];*/ double *b; double *a, *a1, *a2, *c; int ridx; MPI_Datatype typeA, typeB; int rstrideB[MAXDIMS]; int rcount[MAXDIMS]; int pidx1 = -1, pidx2 = -1, pidx3 = -1; elems = 1; strideA[0]=sizeof(double); strideB[0]=sizeof(double); for(i=0;i<ndim;i++){ strideA[i] *= dimsA[i]; strideB[i] *= dimsB[i]; if(i<ndim-1){ strideA[i+1] = strideA[i]; strideB[i+1] = strideB[i]; } elems *= dimsA[i]; } /* create shared and local arrays */ create_safe_array((void**)&b, sizeof(double),ndim,dimsB); a1 = (double *)malloc(sizeof(double)*elems); assert(a1); a2 = (double *)malloc(sizeof(double)*elems); assert(a2); c = (double *)malloc(sizeof(double)*elems); assert(c); init(a1, ndim, elems, dimsA, me!=0, 0); init(a2, ndim, elems, dimsA, me!=0, 1); if(me==0){ printf("--------array[%d",dimsA[0]); for(dim=1;dim<ndim;dim++)printf(",%d",dimsA[dim]); printf("]--------\n"); } sleep(1); MP_BARRIER(); for(i=0;i<LOOP;i++){ int idx1, idx2, idx3, ridx; MPI_Request request; if (i%2) { a = a2; } else { a = a1; } get_range(ndim, dimsA, loA, hiA); new_range(ndim, dimsB, loA, hiA, loB, hiB); new_range(ndim, dimsA, loA, hiA, loC, hiC); proc=nproc-1-me; if(me==0){ print_range("local",ndim,loA, hiA,"-> "); print_range("remote",ndim,loB, hiB,"-> "); print_range("local",ndim,loC, hiC,"\n"); } idx1 = Index(ndim, loA, dimsA); idx2 = Index(ndim, loB, dimsB); idx3 = Index(ndim, loC, dimsA); MPI_Sendrecv(&idx2, 1, MPI_INT, proc, 666, &ridx, 1, MPI_INT, proc, 666, MPI_COMM_WORLD, MPI_STATUS_IGNORE); for(j=0;j<ndim;j++)count[j]=hiA[j]-loA[j]+1; count[0] *= sizeof(double); /* convert range to bytes at stride level zero */ Strided_to_dtype(strideA, count, ndim-1, MPI_BYTE, &typeA); MPI_Type_commit(&typeA); Strided_to_dtype(strideB, count, ndim-1, MPI_BYTE, &typeB); MPI_Type_commit(&typeB); MPI_Accumulate(a + idx1, 1, typeA, proc, (MPI_Aint)(idx2*sizeof(double)), 1, typeB, MPI_REPLACE, win); MP_FLUSH(proc); /* note that we do not need Fence here since * consectutive operations targeting the same process are ordered */ MPI_Get_accumulate(NULL, 0, MPI_BYTE, c + idx3, 1, typeA, proc, (MPI_Aint)(idx2*sizeof(double)), 1, typeB, MPI_NO_OP, win); MP_FLUSH(proc); compare_patches(0., ndim, a+idx1, loA, hiA, dimsA, c+idx3, loC, hiC, dimsA); pidx1 = idx1; pidx2 = idx2; pidx3 = idx3; MPI_Type_free(&typeA); MPI_Type_free(&typeB); } free(c); destroy_safe_array(); free(a); }