Example #1
int main(int argc, char **argv)
    int rank, size;
    MPI_Win win = MPI_WIN_NULL;
    int *baseptr = NULL;
    int errs = 0, mpi_errno = MPI_SUCCESS;
    int val1 = 0, val2 = 0, flag = 0;
    MPI_Request reqs[2];
    MPI_Status stats[2];

    MTest_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);


    MPI_Win_allocate(2 * sizeof(int), sizeof(int), MPI_INFO_NULL, MPI_COMM_WORLD, &baseptr, &win);

    /* Initialize window buffer */
    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win);
    baseptr[0] = 1;
    baseptr[1] = 2;
    MPI_Win_unlock(rank, win);
    /* Issue request-based get with testall. */
    MPI_Win_lock_all(0, win);
    MPI_Rget(&val1, 1, MPI_INT, 0, 0, 1, MPI_INT, win, &reqs[0]);
    MPI_Rget(&val2, 1, MPI_INT, 0, 1, 1, MPI_INT, win, &reqs[1]);

    do {
        mpi_errno = MPI_Testall(2, reqs, &flag, stats);
    } while (flag == 0);

    /* Check get value. */
    if (val1 != 1 || val2 != 2) {
        printf("%d - Got val1 = %d, val2 = %d, expected 1, 2\n", rank, val1, val2);

    /* Check return error code. */
    if (mpi_errno != MPI_SUCCESS) {
        printf("%d - Got return errno %d, expected MPI_SUCCESS(%d)\n",
               rank, mpi_errno, MPI_SUCCESS);




    return errs != 0;
Example #2
/*
 * Request-based get whose element counts are MPI_Count instead of int.
 *
 * When both counts are at most bigmpi_int_max the call is forwarded to
 * MPI_Rget unchanged.  Otherwise each (count, datatype) pair is folded into
 * a single derived datatype built by MPIX_Type_contiguous_x and the transfer
 * is issued with a count of 1 on both sides.
 *
 * Returns the return code of MPI_Rget; *request receives its request.
 */
int MPIX_Rget_x(void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype,
                int target_rank, MPI_Aint target_disp, MPI_Count target_count, MPI_Datatype target_datatype,
                MPI_Win win, MPI_Request *request)
{
    int rc = MPI_SUCCESS;

    if (likely (origin_count <= bigmpi_int_max && target_count <= bigmpi_int_max)) {
        /* The narrowing is safe: both counts were just range-checked. */
        rc = MPI_Rget(origin_addr, (int)origin_count, origin_datatype,
                      target_rank, target_disp, (int)target_count, target_datatype, win, request);
    } else {
        MPI_Datatype neworigin_datatype, newtarget_datatype;
        MPIX_Type_contiguous_x(origin_count, origin_datatype, &neworigin_datatype);
        MPIX_Type_contiguous_x(target_count, target_datatype, &newtarget_datatype);
        /* A derived datatype has to be committed before it may be used in
         * communication. */
        MPI_Type_commit(&neworigin_datatype);
        MPI_Type_commit(&newtarget_datatype);
        rc = MPI_Rget(origin_addr, 1, neworigin_datatype,
                      target_rank, target_disp, 1, newtarget_datatype, win, request);
        /* Freeing only marks the types for deallocation; a transfer that is
         * still pending keeps using them, so nothing leaks and nothing is
         * invalidated. */
        MPI_Type_free(&neworigin_datatype);
        MPI_Type_free(&newtarget_datatype);
    }
    return rc;
}
Example #3
/*
 * JNI implementation of mpi.Win.rGet: starts a request-based get from the
 * window handle `win` into the direct buffer `origin`.
 *
 * The datatype and window arguments arrive as jlong values and are cast
 * back to their MPI handle types.  The MPI return code is handed to
 * ompi_java_exceptionCheck; the request handle is returned as a jlong.
 * `jthis` and `base` are not used here.
 */
JNIEXPORT jlong JNICALL Java_mpi_Win_rGet(JNIEnv *env, jobject jthis, jlong win,
    jobject origin, jint orgCount, jlong orgType, jint targetRank, jint targetDisp,
    jint targetCount, jlong targetType, jint base)
{
    void *orgPtr = (*env)->GetDirectBufferAddress(env, origin);
    /* Start from the null handle so the value returned below is well defined
     * even when MPI_Rget fails before writing the request. */
    MPI_Request request = MPI_REQUEST_NULL;

    int rc = MPI_Rget(orgPtr, orgCount, (MPI_Datatype)orgType,
                      targetRank, (MPI_Aint)targetDisp, targetCount,
                      (MPI_Datatype)targetType, (MPI_Win)win, &request);

    ompi_java_exceptionCheck(env, rc);
    return (jlong)request;
}
Example #4
int main( int argc, char *argv[] )
    int rank, nproc, i;
    int errors = 0, all_errors = 0;
    int *buf;
    MPI_Win window;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);

    if (nproc < 2) {
        if (rank == 0) printf("Error: must be run with two or more processes\n");
        MPI_Abort(MPI_COMM_WORLD, 1);

    /** Create using MPI_Win_create() **/

    if (rank == 0) {
      MPI_Alloc_mem(4*sizeof(int), MPI_INFO_NULL, &buf);
      *buf = nproc-1;
    } else
      buf = NULL;

    MPI_Win_create(buf, 4*sizeof(int)*(rank == 0), 1, MPI_INFO_NULL, MPI_COMM_WORLD, &window);

    /* PROC_NULL Communication */
        MPI_Request pn_req[4];
        int val[4], res;

        MPI_Win_lock_all(0, window);

        MPI_Rget_accumulate(&val[0], 1, MPI_INT, &res, 1, MPI_INT, MPI_PROC_NULL, 0, 1, MPI_INT, MPI_REPLACE, window, &pn_req[0]);
        MPI_Rget(&val[1], 1, MPI_INT, MPI_PROC_NULL, 1, 1, MPI_INT, window, &pn_req[1]);
        MPI_Rput(&val[2], 1, MPI_INT, MPI_PROC_NULL, 2, 1, MPI_INT, window, &pn_req[2]);
        MPI_Raccumulate(&val[3], 1, MPI_INT, MPI_PROC_NULL, 3, 1, MPI_INT, MPI_REPLACE, window, &pn_req[3]);

        assert(pn_req[0] != MPI_REQUEST_NULL);
        assert(pn_req[1] != MPI_REQUEST_NULL);
        assert(pn_req[2] != MPI_REQUEST_NULL);
        assert(pn_req[3] != MPI_REQUEST_NULL);


        MPI_Waitall(4, pn_req, MPI_STATUSES_IGNORE);


    MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, window);

    /* GET-ACC: Test third-party communication, through rank 0. */
    for (i = 0; i < ITER; i++) {
        MPI_Request gacc_req;
        int val = -1, exp = -1;

        /* Processes form a ring.  Process 0 starts first, then passes a token
         * to the right.  Each process, in turn, performs third-party
         * communication via process 0's window. */
        if (rank > 0) {
            MPI_Recv(NULL, 0, MPI_BYTE, rank-1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

        MPI_Rget_accumulate(&rank, 1, MPI_INT, &val, 1, MPI_INT, 0, 0, 1, MPI_INT, MPI_REPLACE, window, &gacc_req);
        assert(gacc_req != MPI_REQUEST_NULL);
        MPI_Wait(&gacc_req, MPI_STATUS_IGNORE);

        MPI_Win_flush(0, window);

        exp = (rank + nproc-1) % nproc;

        if (val != exp) {
            printf("%d - Got %d, expected %d\n", rank, val, exp);

        if (rank < nproc-1) {
            MPI_Send(NULL, 0, MPI_BYTE, rank+1, 0, MPI_COMM_WORLD);



    if (rank == 0) *buf = nproc-1;

    /* GET+PUT: Test third-party communication, through rank 0. */
    for (i = 0; i < ITER; i++) {
        MPI_Request req;
        int val = -1, exp = -1;

        /* Processes form a ring.  Process 0 starts first, then passes a token
         * to the right.  Each process, in turn, performs third-party
         * communication via process 0's window. */
        if (rank > 0) {
            MPI_Recv(NULL, 0, MPI_BYTE, rank-1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

        MPI_Rget(&val, 1, MPI_INT, 0, 0, 1, MPI_INT, window, &req);
        assert(req != MPI_REQUEST_NULL);
        MPI_Wait(&req, MPI_STATUS_IGNORE);

        MPI_Rput(&rank, 1, MPI_INT, 0, 0, 1, MPI_INT, window, &req);
        assert(req != MPI_REQUEST_NULL);
        MPI_Wait(&req, MPI_STATUS_IGNORE);

        MPI_Win_flush(0, window);

        exp = (rank + nproc-1) % nproc;

        if (val != exp) {
            printf("%d - Got %d, expected %d\n", rank, val, exp);

        if (rank < nproc-1) {
            MPI_Send(NULL, 0, MPI_BYTE, rank+1, 0, MPI_COMM_WORLD);



    if (rank == 0) *buf = nproc-1;

    /* GET+ACC: Test third-party communication, through rank 0. */
    for (i = 0; i < ITER; i++) {
        MPI_Request req;
        int val = -1, exp = -1;

        /* Processes form a ring.  Process 0 starts first, then passes a token
         * to the right.  Each process, in turn, performs third-party
         * communication via process 0's window. */
        if (rank > 0) {
            MPI_Recv(NULL, 0, MPI_BYTE, rank-1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

        MPI_Rget(&val, 1, MPI_INT, 0, 0, 1, MPI_INT, window, &req);
        assert(req != MPI_REQUEST_NULL);
        MPI_Wait(&req, MPI_STATUS_IGNORE);

        MPI_Raccumulate(&rank, 1, MPI_INT, 0, 0, 1, MPI_INT, MPI_REPLACE, window, &req);
        assert(req != MPI_REQUEST_NULL);
        MPI_Wait(&req, MPI_STATUS_IGNORE);

        MPI_Win_flush(0, window);

        exp = (rank + nproc-1) % nproc;

        if (val != exp) {
            printf("%d - Got %d, expected %d\n", rank, val, exp);

        if (rank < nproc-1) {
            MPI_Send(NULL, 0, MPI_BYTE, rank+1, 0, MPI_COMM_WORLD);

    MPI_Win_unlock(0, window);


    /* Wait inside of an epoch */
        MPI_Request pn_req[4];
        int val[4], res;
        const int target = 0;

        MPI_Win_lock_all(0, window);

        MPI_Rget_accumulate(&val[0], 1, MPI_INT, &res, 1, MPI_INT, target, 0, 1, MPI_INT, MPI_REPLACE, window, &pn_req[0]);
        MPI_Rget(&val[1], 1, MPI_INT, target, 1, 1, MPI_INT, window, &pn_req[1]);
        MPI_Rput(&val[2], 1, MPI_INT, target, 2, 1, MPI_INT, window, &pn_req[2]);
        MPI_Raccumulate(&val[3], 1, MPI_INT, target, 3, 1, MPI_INT, MPI_REPLACE, window, &pn_req[3]);

        assert(pn_req[0] != MPI_REQUEST_NULL);
        assert(pn_req[1] != MPI_REQUEST_NULL);
        assert(pn_req[2] != MPI_REQUEST_NULL);
        assert(pn_req[3] != MPI_REQUEST_NULL);

        MPI_Waitall(4, pn_req, MPI_STATUSES_IGNORE);



    /* Wait outside of an epoch */
        MPI_Request pn_req[4];
        int val[4], res;
        const int target = 0;

        MPI_Win_lock_all(0, window);

        MPI_Rget_accumulate(&val[0], 1, MPI_INT, &res, 1, MPI_INT, target, 0, 1, MPI_INT, MPI_REPLACE, window, &pn_req[0]);
        MPI_Rget(&val[1], 1, MPI_INT, target, 1, 1, MPI_INT, window, &pn_req[1]);
        MPI_Rput(&val[2], 1, MPI_INT, target, 2, 1, MPI_INT, window, &pn_req[2]);
        MPI_Raccumulate(&val[3], 1, MPI_INT, target, 3, 1, MPI_INT, MPI_REPLACE, window, &pn_req[3]);

        assert(pn_req[0] != MPI_REQUEST_NULL);
        assert(pn_req[1] != MPI_REQUEST_NULL);
        assert(pn_req[2] != MPI_REQUEST_NULL);
        assert(pn_req[3] != MPI_REQUEST_NULL);


        MPI_Waitall(4, pn_req, MPI_STATUSES_IGNORE);

    /* Wait in a different epoch */
        MPI_Request pn_req[4];
        int val[4], res;
        const int target = 0;

        MPI_Win_lock_all(0, window);

        MPI_Rget_accumulate(&val[0], 1, MPI_INT, &res, 1, MPI_INT, target, 0, 1, MPI_INT, MPI_REPLACE, window, &pn_req[0]);
        MPI_Rget(&val[1], 1, MPI_INT, target, 1, 1, MPI_INT, window, &pn_req[1]);
        MPI_Rput(&val[2], 1, MPI_INT, target, 2, 1, MPI_INT, window, &pn_req[2]);
        MPI_Raccumulate(&val[3], 1, MPI_INT, target, 3, 1, MPI_INT, MPI_REPLACE, window, &pn_req[3]);

        assert(pn_req[0] != MPI_REQUEST_NULL);
        assert(pn_req[1] != MPI_REQUEST_NULL);
        assert(pn_req[2] != MPI_REQUEST_NULL);
        assert(pn_req[3] != MPI_REQUEST_NULL);


        MPI_Win_lock_all(0, window);
        MPI_Waitall(4, pn_req, MPI_STATUSES_IGNORE);

    /* Wait in a fence epoch */
        MPI_Request pn_req[4];
        int val[4], res;
        const int target = 0;

        MPI_Win_lock_all(0, window);

        MPI_Rget_accumulate(&val[0], 1, MPI_INT, &res, 1, MPI_INT, target, 0, 1, MPI_INT, MPI_REPLACE, window, &pn_req[0]);
        MPI_Rget(&val[1], 1, MPI_INT, target, 1, 1, MPI_INT, window, &pn_req[1]);
        MPI_Rput(&val[2], 1, MPI_INT, target, 2, 1, MPI_INT, window, &pn_req[2]);
        MPI_Raccumulate(&val[3], 1, MPI_INT, target, 3, 1, MPI_INT, MPI_REPLACE, window, &pn_req[3]);

        assert(pn_req[0] != MPI_REQUEST_NULL);
        assert(pn_req[1] != MPI_REQUEST_NULL);
        assert(pn_req[2] != MPI_REQUEST_NULL);
        assert(pn_req[3] != MPI_REQUEST_NULL);


        MPI_Win_fence(0, window);
        MPI_Waitall(4, pn_req, MPI_STATUSES_IGNORE);
        MPI_Win_fence(0, window);

    if (buf) MPI_Free_mem(buf);

    MPI_Reduce(&errors, &all_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

    if (rank == 0 && all_errors == 0)
        printf(" No Errors\n");


    return 0;
Example #5
/*
 * Starts a get of nbytes bytes from the global pointer gptr into dest and
 * returns a handle for it in *handle.
 *
 * With shared windows enabled and a non-negative entry for the target in
 * dart_sharedmem_table, the data is copied with memcpy and the handle is
 * marked complete (request == MPI_REQUEST_NULL).  Otherwise the transfer is
 * issued with MPI_Rget on the segment's window (collective allocation,
 * seg_id != 0) or on dart_win_local_alloc (local allocation, seg_id == 0).
 *
 * Returns DART_OK on success.  On failure DART_ERR_INVAL is returned, the
 * handle is released and *handle is set to NULL, so nothing is leaked.
 */
dart_ret_t dart_get_handle(
  void          * dest,
  dart_gptr_t     gptr,
  size_t          nbytes,
  dart_handle_t * handle)
{
  MPI_Request  mpi_req;
  MPI_Aint     disp_s,
               disp_rel;
  MPI_Datatype mpi_type;
  MPI_Win      win;
  dart_unit_t  target_unitid_abs = gptr.unitid;
  dart_unit_t  target_unitid_rel = target_unitid_abs;
  int          mpi_ret;
  uint64_t     offset = gptr.addr_or_offs.offset;
  uint16_t     index  = gptr.flags;
  int16_t      seg_id = gptr.segid;
  /*
   * MPI uses offset type int, do not copy more than INT_MAX elements:
   */
  if (nbytes > INT_MAX) {
    DART_LOG_ERROR("dart_get_handle ! failed: nbytes > INT_MAX");
    return DART_ERR_INVAL;
  }
  int n_count = (int)(nbytes);

  mpi_type = MPI_BYTE;

  *handle = (dart_handle_t) malloc(sizeof(struct dart_handle_struct));
  if (*handle == NULL) {
    /* NOTE(review): DART_ERR_INVAL is the only error code visible in this
     * function; switch to a dedicated out-of-memory code if one exists. */
    DART_LOG_ERROR("dart_get_handle ! failed: could not allocate handle");
    return DART_ERR_INVAL;
  }

  if (seg_id > 0) {
    unit_g2l(index, target_unitid_abs, &target_unitid_rel);
  }
  DART_LOG_DEBUG("dart_get_handle() uid_abs:%d uid_rel:%d "
                 "o:%"PRIu64" s:%d i:%d, nbytes:%zu",
                 target_unitid_abs, target_unitid_rel,
                 offset, seg_id, index, nbytes);
  DART_LOG_TRACE("dart_get_handle:  allocated handle:%p", (void *)(*handle));

#if !defined(DART_MPI_DISABLE_SHARED_WINDOWS)
  DART_LOG_DEBUG("dart_get_handle: shared windows enabled");
  if (seg_id >= 0) {
    int       i;
    char *    baseptr;
    /*
     * Use memcpy if the target is in the same node as the calling unit:
     */
    i = dart_sharedmem_table[index][gptr.unitid];
    if (i >= 0) {
      DART_LOG_DEBUG("dart_get_handle: shared memory segment, seg_id:%d",
                     seg_id);
      if (seg_id) {
        if (dart_adapt_transtable_get_baseptr(seg_id, i, &baseptr) == -1) {
          DART_LOG_ERROR("dart_get_handle ! "
                         "dart_adapt_transtable_get_baseptr failed");
          goto fail;
        }
      } else {
        baseptr = dart_sharedmem_local_baseptr_set[i];
      }
      baseptr += offset;
      DART_LOG_DEBUG("dart_get_handle: memcpy %zu bytes", nbytes);
      memcpy((char*)dest, baseptr, nbytes);

      /*
       * Mark request as completed:
       */
      (*handle)->request = MPI_REQUEST_NULL;
      if (seg_id != 0) {
        (*handle)->dest = target_unitid_rel;
        (*handle)->win  = dart_win_lists[index];
      } else {
        (*handle)->dest = target_unitid_abs;
        (*handle)->win  = dart_win_local_alloc;
      }
      return DART_OK;
    }
  }
#else
  DART_LOG_DEBUG("dart_get_handle: shared windows disabled");
#endif /* !defined(DART_MPI_DISABLE_SHARED_WINDOWS) */
  /*
   * MPI shared windows disabled or target and calling unit are on different
   * nodes, use MPI_RGet:
   */
  if (seg_id != 0) {
    /*
     * The memory accessed is allocated with collective allocation.
     */
    DART_LOG_TRACE("dart_get_handle:  collective, segment:%d", seg_id);
    win = dart_win_lists[index];
    /* Translate local unitID (relative to teamid) into global unitID
     * (relative to DART_TEAM_ALL).
     *
     * Note: target_unitid should not be the global unitID but rather the
     * local unitID relative to the team associated with the specified win
     * object.
     */
    if (dart_adapt_transtable_get_disp(
          seg_id,
          target_unitid_rel,
          &disp_s) == -1)
    {
      DART_LOG_ERROR(
        "dart_get_handle ! dart_adapt_transtable_get_disp failed");
      goto fail;
    }
    disp_rel = disp_s + offset;
    /* MPI_Aint is not guaranteed to be int64_t; cast to match PRId64. */
    DART_LOG_TRACE("dart_get_handle:  -- disp_s:%"PRId64" disp_rel:%"PRId64"",
                   (int64_t)disp_s, (int64_t)disp_rel);

    /* TODO: Check if
     *    MPI_Rget_accumulate(
     *      NULL, 0, MPI_BYTE, dest, nbytes, MPI_BYTE,
     *      target_unitid, disp_rel, nbytes, MPI_BYTE, MPI_NO_OP, win,
     *      &mpi_req)
     *  ... could be a better alternative?
     */
    DART_LOG_DEBUG("dart_get_handle:  -- %d elements (collective allocation) "
                   "from %d at offset %"PRIu64"",
                   n_count, target_unitid_rel, offset);
    DART_LOG_DEBUG("dart_get_handle:  -- MPI_Rget");
    mpi_ret = MPI_Rget(
                dest,              // origin address
                n_count,           // origin count
                mpi_type,          // origin data type
                target_unitid_rel, // target rank
                disp_rel,          // target disp in window
                n_count,           // target count
                mpi_type,          // target data type
                win,               // window
                &mpi_req);         // request
    if (mpi_ret != MPI_SUCCESS) {
      DART_LOG_ERROR("dart_get_handle ! MPI_Rget failed");
      goto fail;
    }
    (*handle)->dest = target_unitid_rel;
  } else {
    /*
     * The memory accessed is allocated with local allocation.
     */
    DART_LOG_TRACE("dart_get_handle:  -- local, segment:%d", seg_id);
    DART_LOG_DEBUG("dart_get_handle:  -- %d elements (local allocation) "
                   "from %d at offset %"PRIu64"",
                   n_count, target_unitid_abs, offset);
    win     = dart_win_local_alloc;
    DART_LOG_DEBUG("dart_get_handle:  -- MPI_Rget");
    mpi_ret = MPI_Rget(
                dest,              // origin address
                n_count,           // origin count
                mpi_type,          // origin data type
                target_unitid_abs, // target rank
                offset,            // target disp in window
                n_count,           // target count
                mpi_type,          // target data type
                win,               // window
                &mpi_req);         // request
    if (mpi_ret != MPI_SUCCESS) {
      DART_LOG_ERROR("dart_get_handle ! MPI_Rget failed");
      goto fail;
    }
    (*handle)->dest = target_unitid_abs;
  }
  (*handle)->request = mpi_req;
  (*handle)->win     = win;
  DART_LOG_TRACE("dart_get_handle > handle(%p) dest:%d win:%"PRIu64" req:%d",
                 (void*)(*handle), (*handle)->dest, (uint64_t)win, mpi_req);
  return DART_OK;

fail:
  /* Every failure after the allocation ends up here so that the handle is
   * released exactly once and the caller never sees a dangling pointer. */
  free(*handle);
  *handle = NULL;
  return DART_ERR_INVAL;
}
Example #6
/*
 * Fortran-callable wrapper around MPI_Rget.
 *
 * Every argument arrives by reference: v1 is the origin buffer, v2..v8 are
 * dereferenced and cast to the C types of the origin count, origin datatype,
 * target rank, target displacement, target count, target datatype and
 * window, and v9 is passed through as the request.  The MPI return code is
 * stored in *ierr.
 */
FORT_DLL_SPEC void FORT_CALL mpi_rget_ ( void*v1, MPI_Fint *v2, MPI_Fint *v3, MPI_Fint *v4, MPI_Fint *v5, MPI_Fint *v6, MPI_Fint *v7, MPI_Fint *v8, MPI_Fint *v9, MPI_Fint *ierr ){
    *ierr = MPI_Rget( v1, (int)*v2, (MPI_Datatype)(*v3), (int)*v4, (MPI_Aint)*v5, (int)*v6, (MPI_Datatype)(*v7), (MPI_Win)*v8, (MPI_Request *)(v9) );
}
Example #7
int main(int argc, char **argv)
    FILE    *fp, *fp2;
    char    testName[32] = "MPI_Rget", file1[64], file2[64];
    int     dblSize, proc, nprocs, npairs, partner;
    unsigned int i, j, k, size, localSize, NLOOP = NLOOP_MAX;
    unsigned int smin = MIN_P2P_SIZE, smed = MED_P2P_SIZE, smax = MAX_P2P_SIZE;
    double  tScale = USEC, bwScale = MB_8;
    double  tStart, timeMin, timeMinGlobal, overhead, threshold_lo, threshold_hi;
    double  msgBytes, sizeBytes, localMax, UsedMem;
    double  tElapsed[NREPS], tElapsedGlobal[NREPS];
    double  *A, *B;
    MPI_Win     win;
    MPI_Status  stat;
    MPI_Request req;

    // Initialize parallel environment
    MPI_Init(&argc, &argv);
    MPI_Comm_size( MPI_COMM_WORLD, &nprocs );
    MPI_Comm_rank( MPI_COMM_WORLD, &proc );

    // Test input parameters
    if( nprocs%2 != 0 && proc == 0 )
        fatalError( "P2P test requires an even number of processors" );

    // Check for user defined limits
    checkEnvP2P( proc, &NLOOP, &smin, &smed, &smax );

    // Initialize local variables
    localMax = 0.0;
    npairs   = nprocs/2;
    if( proc < npairs  ) partner = proc + npairs;
    if( proc >= npairs ) partner = proc - npairs;
    UsedMem = (double)smax*(double)sizeof(double)*2.0;

    // Allocate and initialize arrays
    srand( SEED );
    A = doubleVector( smax );
    B = doubleVector( smax );

    // Open output file and write header
    if( proc == 0 ){
        // Check timer overhead in seconds
        timerTest( &overhead, &threshold_lo, &threshold_hi );
        // Open output files and write headers
        sprintf( file1, "rget_time-np_%.4d.dat", nprocs );
        sprintf( file2, "rget_bw-np_%.4d.dat",   nprocs );
        fp  = fopen( file1, "a" );
        fp2 = fopen( file2, "a" );
        printHeaders( fp, fp2, testName, UsedMem, overhead, threshold_lo );

    // Get type size
    MPI_Type_size( MPI_DOUBLE, &dblSize );
    // Set up a window for RMA
    MPI_Win_create( A, smax*dblSize, dblSize, MPI_INFO_NULL, MPI_COMM_WORLD, &win );
    MPI_Win_lock_all( 0, win );
    // Single loop with minimum size to verify that inner loop length  
    // is long enough for the timings to be accurate                     
    // Warmup with a medium size message
    if( proc < npairs ){
        MPI_Rget( B, smed, MPI_DOUBLE, partner, 0, smed, MPI_DOUBLE, win, &req );
        MPI_Wait( &req, &stat );
        MPI_Win_flush_all( win );
    // Test if current NLOOP is enough to capture fastest test cases
    MPI_Barrier( MPI_COMM_WORLD );
    tStart = benchTimer();
    if( proc < npairs ){
        for(j = 0; j < NLOOP; j++){
        	MPI_Rget( B, smin, MPI_DOUBLE, partner, 0, smin, MPI_DOUBLE, win, &req );
        	MPI_Wait( &req, &stat );
        	MPI_Win_flush_all( win );
    timeMin = benchTimer() - tStart;
    MPI_Reduce( &timeMin, &timeMinGlobal, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD );
    if( proc == 0 ) resetInnerLoop( timeMinGlobal, threshold_lo, &NLOOP );
    MPI_Bcast( &NLOOP, 1, MPI_INT, 0, MPI_COMM_WORLD );

    // Execute test for each requested size                  
    for( size = smin; size <= smax; size = size*2 ){

        // Warmup with a medium size message
        if( proc < npairs ){
            MPI_Rget( B, smed, MPI_DOUBLE, partner, 0, smed, MPI_DOUBLE, win, &req );
            MPI_Wait( &req, &stat );
            MPI_Win_flush_all( win );

        // Repeat NREPS to collect statistics
        for(i = 0; i < NREPS; i++){
            MPI_Barrier( MPI_COMM_WORLD );
            tStart = benchTimer();
            if( proc < npairs ){
                for(j = 0; j < NLOOP; j++){
        	        MPI_Rget( B, size, MPI_DOUBLE, partner, 0, size, MPI_DOUBLE, win, &req );
        	        MPI_Wait( &req, &stat );
        	        MPI_Win_flush_all( win );
        	tElapsed[i] = benchTimer() - tStart;
        MPI_Reduce( tElapsed, tElapsedGlobal, NREPS, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD );
        // Only task 0 needs to do the analysis of the collected data
        if( proc == 0 ){
            // sizeBytes is size to write to file
            // msgBytes is actual data exchanged on the wire
            msgBytes  = (double)size*(double)npairs*(double)dblSize;
            sizeBytes = (double)size*(double)dblSize;
            post_process( fp, fp2, threshold_hi, tElapsedGlobal, tScale, 
                          bwScale, size*dblSize, sizeBytes, msgBytes, &NLOOP, 
                          &localMax, &localSize );
        MPI_Bcast( &NLOOP, 1, MPI_INT, 0, MPI_COMM_WORLD );

    MPI_Win_unlock_all( win );
    MPI_Win_free( &win );
    MPI_Barrier( MPI_COMM_WORLD );
    free( A );
    free( B );

    // Print completion message, free memory and exit                  
    if( proc == 0 ){
        printSummary( fp2, testName, localMax, localSize );
        fclose( fp2 ); 
        fclose( fp );

    return 0;