Example #1
static void esync_master(VT_MPI_INT slave, MPI_Comm comm, VT_MPI_INT masterid)
{
  int i;
   
  uint64_t tsend, trecv, tslave;
  uint64_t t1, t2, t3, t4;
   
  MPI_Status stat;
  MPI_Request req;
  Sync_TsPerPhase* temp;
   
  /* exchange LOOP_COUNT ping pong messages with the communication partner */
   
  t1 = vt_pform_wtime();
  PMPI_Isend( &t1, 1, MPI_LONG_LONG_INT, slave, 0, comm, &req );
  PMPI_Recv( &t2, 1, MPI_LONG_LONG_INT, slave, 0, comm, &stat );
  t4 = vt_pform_wtime();
  t3 = t2;
  PMPI_Waitall( 1, &req, &stat );
   
  for( i = 1; i < LOOP_COUNT; i++ )
  {
    tsend = vt_pform_wtime();
      
    /* message exchange */

    PMPI_Isend(&tsend, 1, MPI_LONG_LONG_INT, slave, i, comm, &req);
    PMPI_Recv(&tslave, 1, MPI_LONG_LONG_INT, slave, i, comm, &stat);
    trecv = vt_pform_wtime();
      
    PMPI_Waitall(1, &req, &stat);

    /* select timestamps with minimum message delay in each direction */

    if ( ( (int64_t)tslave - (int64_t)tsend ) < ( (int64_t)t2 - (int64_t)t1 ) )
    {
      t1 = tsend;
      t2 = tslave;
    }
    if ( ( (int64_t)trecv - (int64_t)tslave ) < ( (int64_t)t4 - (int64_t)t3 ) )
    {
      t3 = tslave;
      t4 = trecv;
    }
  }

  /* save synchronization measurement data into internal data structure */

  temp = (Sync_TsPerPhase*)malloc(sizeof(Sync_TsPerPhase));
  if (!temp) vt_error();
  temp->id1  = masterid;
  temp->id2  = slave;
  temp->t1   = t1;
  temp->t2   = t2;
  temp->t3   = t3;
  temp->t4   = t4;
  temp->next = SyncTsPerRunLast->sync_phase;
  SyncTsPerRunLast->sync_phase = temp;
}
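
For orientation, the peer side of this ping-pong exchange has to receive each master timestamp, take its own timestamp, and echo it back. A minimal sketch of such a counterpart follows; the function name and exact structure are assumptions for illustration, not VampirTrace's actual slave-side code.

/* Illustrative sketch only: a possible peer ("slave") side for the
 * ping-pong above. LOOP_COUNT, VT_MPI_INT and vt_pform_wtime() are taken
 * from the surrounding example; the function name is hypothetical. */
static void esync_slave_sketch(VT_MPI_INT master, MPI_Comm comm)
{
  int i;
  uint64_t tmaster, tslave;
  MPI_Status stat;
  MPI_Request req;

  for (i = 0; i < LOOP_COUNT; i++)
  {
    /* receive the master's send timestamp for round i */
    PMPI_Recv(&tmaster, 1, MPI_LONG_LONG_INT, master, i, comm, &stat);

    /* take the local timestamp and echo it back */
    tslave = vt_pform_wtime();
    PMPI_Isend(&tslave, 1, MPI_LONG_LONG_INT, master, i, comm, &req);
    PMPI_Wait(&req, &stat);
  }
}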
Example #2
int  MPI_Isend(MPE_CONST void *buf, int count, MPI_Datatype datatype,
	       int dest, int tag, MPI_Comm comm, MPI_Request *request)
{
  int  returnVal;
  request_list *newrq;
  int typesize3;

  
  
/* fprintf( stderr, "MPI_Isend call on %d\n", procid_1 ); */
  
  returnVal = PMPI_Isend( buf, count, datatype, dest, tag, comm, request );

  if (dest != MPI_PROC_NULL) {
    rq_alloc( requests_avail_1, newrq );
    if (newrq) {
      PMPI_Type_size( datatype, &typesize3 );
      newrq->request = *request;
      newrq->status = RQ_SEND;
      newrq->size = count * typesize3;
      newrq->tag = tag;
      newrq->otherParty = dest;
      newrq->next = 0;
      rq_add( requests_head_1, requests_tail_1, newrq );
    }
  }

  return returnVal;
}
Example #3
int MPI_Isend(const void* buffer, int count, MPI_Datatype datatype,
              int dst, int tag, MPI_Comm comm, MPI_Request* request)
{
    cqueue_t* mycqueue = handle_get_cqueue(comm);

    if (mycqueue != NULL)
        return cqueue_isend(mycqueue, buffer, count, datatype, dst, tag, comm, request);
    else
    {
        if (std_mpi_mode == STD_MPI_MODE_IMPLICIT && max_ep > 0)
            return cqueue_isend(client_get_cqueue((taskid + dst) % max_ep), buffer,
                                count, datatype, dst, tag, comm, request);
        return PMPI_Isend(buffer, count, datatype, dst, tag, comm, request);
    }
}
Example #4
static int MTCORE_Send_pscw_post_msg(int post_grp_size, MTCORE_Win * uh_win)
{
    int mpi_errno = MPI_SUCCESS;
    int i, user_rank;
    char post_flg = 1;
    MPI_Request *reqs = NULL;
    MPI_Status *stats = NULL;
    int remote_cnt = 0;

    reqs = calloc(post_grp_size, sizeof(MPI_Request));
    stats = calloc(post_grp_size, sizeof(MPI_Status));

    PMPI_Comm_rank(uh_win->user_comm, &user_rank);

    for (i = 0; i < post_grp_size; i++) {
        int origin_rank = uh_win->post_ranks_in_win_group[i];

        /* Do not send to the local target, otherwise it may deadlock.
         * We do not check the erroneous sync case where the user calls
         * start(self) before post(self). */
        if (user_rank == origin_rank)
            continue;

        mpi_errno = PMPI_Isend(&post_flg, 1, MPI_CHAR, origin_rank,
                               MTCORE_PSCW_PS_TAG, uh_win->user_comm, &reqs[remote_cnt++]);
        if (mpi_errno != MPI_SUCCESS)
            goto fn_fail;

        /* Set the post flag to true on the main helper of the post origin. */
        MTCORE_DBG_PRINT("send pscw post msg to origin %d \n", origin_rank);
    }

    /* Have to block and wait here in order to poll progress. */
    mpi_errno = PMPI_Waitall(remote_cnt, reqs, stats);
    if (mpi_errno != MPI_SUCCESS)
        goto fn_fail;

  fn_exit:
    if (reqs)
        free(reqs);
    if (stats)
        free(stats);
    return mpi_errno;

  fn_fail:
    goto fn_exit;
}
Example #5
int MPI_Isend(const void *buf, int count, MPI_Datatype type, int dest,
              int tag, MPI_Comm comm, MPI_Request *request)
{
    char typename[MPI_MAX_OBJECT_NAME], commname[MPI_MAX_OBJECT_NAME];
    int len;
    int rank;
    
    PMPI_Comm_rank(MPI_COMM_WORLD, &rank);    
    PMPI_Type_get_name(type, typename, &len);
    PMPI_Comm_get_name(comm, commname, &len);
    
    fprintf(stderr, "MPI_ISEND[%d]: buf %0" PRIxPTR " count %d datatype %s dest %d tag %d comm %s\n",
           rank, (uintptr_t) buf, count, typename, dest, tag, commname);
    fflush(stderr);
    
    return PMPI_Isend(buf, count, type, dest, tag, comm, request);
}
Example #6
int MPI_Send_Nospin( void *buff, const int count, MPI_Datatype datatype,
                     const int dest, const int tag, MPI_Comm comm )
{
    MPI_Request req;
    PMPI_Isend( buff, count, datatype, dest, tag, comm, &req );

    MPI_Status status;
    timespec ts{ 0, nsec_start };
    int flag = 0;
    while ( !flag )
    {
        nanosleep( &ts, nullptr );
        ts.tv_nsec = std::min( size_t(ts.tv_nsec << 1), nsec_max );
        PMPI_Request_get_status( req, &flag, &status );
    }

    return status.MPI_ERROR;
}
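
A receive-side counterpart built on the same exponential-backoff polling idea could look like the sketch below; the name MPI_Recv_Nospin and the reuse of nsec_start / nsec_max are assumptions, not part of the original code.

// Sketch only: a non-spinning receive using the same backoff loop as above.
// nsec_start and nsec_max are assumed to be the globals used by MPI_Send_Nospin.
int MPI_Recv_Nospin( void *buff, const int count, MPI_Datatype datatype,
                     const int src, const int tag, MPI_Comm comm,
                     MPI_Status *status )
{
    MPI_Request req;
    PMPI_Irecv( buff, count, datatype, src, tag, comm, &req );

    timespec ts{ 0, nsec_start };
    int flag = 0;
    while ( !flag )
    {
        nanosleep( &ts, nullptr );
        ts.tv_nsec = std::min( size_t(ts.tv_nsec << 1), nsec_max );
        PMPI_Request_get_status( req, &flag, status );
    }

    return status->MPI_ERROR;
}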
Example #7
int MPI_Isend( void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request)
{
    int err;
    int realsize;

    // actual send
    err = PMPI_Isend(buf, count, datatype, dest, tag, comm, request);

    // get datatype size
    MPI_Type_size(datatype, &realsize);
    realsize *= count;

    // record send count
    my_send_count[dest]++;

    // record send size
    my_send_size[dest] += realsize;


    return err;
}
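
For context, my_send_count and my_send_size above are assumed to be simple per-destination arrays maintained by the wrapper library; a minimal sketch of how they might be declared and reported at shutdown follows (MAX_PROCS, the array types, and the MPI_Finalize wrapper are assumptions).

#include <stdio.h>
#include <mpi.h>

/* Sketch only: assumed per-destination counters and a reporting wrapper.
 * MAX_PROCS is a hypothetical upper bound on the communicator size. */
#define MAX_PROCS 1024

long my_send_count[MAX_PROCS];
long my_send_size[MAX_PROCS];

int MPI_Finalize(void)
{
    int rank, size, i;
    PMPI_Comm_rank(MPI_COMM_WORLD, &rank);
    PMPI_Comm_size(MPI_COMM_WORLD, &size);
    for (i = 0; i < size; i++)
        if (my_send_count[i] > 0)
            fprintf(stderr, "[%d] sent %ld messages / %ld bytes to %d\n",
                    rank, my_send_count[i], my_send_size[i], i);
    return PMPI_Finalize();
}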
Example #8
int MPI_Isend(MPI_CONST void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm,
	      MPI_Request *req)
{
  int done; 
  PNMPIMOD_Datatype_Parameters_t ref;
  char *b;
  int l,s;
  MPI_Datatype t;

  r_get(buf, count, datatype, &ref);

  printf("Sending to %i :\n",dest);
  do
    {
      PNMPIMOD_Datatype_getItem(&ref,&b,&t,&l,&s,&done)
#ifdef USE_FUNCTIONS
	;
#endif
      printf("\t%i ",l);
      if (t==MPI_INT) printf("INT   ");
      else if (t==MPI_SHORT) printf("SHORT ");
      else if (t==MPI_LONG) printf("LONG  ");
      else if (t==MPI_CHAR) printf("CHAR  ");
      else if (t==MPI_DOUBLE) printf("DOUBLE");
      else if (t==MPI_FLOAT) printf("FLOAT ");
      else printf("Other");

      printf(" of size %i at buf %16p / %li\n",s,b,((long) b)-((long) buf));
      fflush(stdout);
    }
  while (!done);

  r_del(&ref);

  return PMPI_Isend(buf,count,datatype,dest,tag,comm,req);
}
Example #9
int main(int argc, char *argv[]) {

    int   numproc, rank, len;
    char  hostname[MPI_MAX_PROCESSOR_NAME];

    PMPI_Init(&argc, &argv);
    PMPI_Comm_size(MPI_COMM_WORLD, &numproc);
    PMPI_Comm_rank(MPI_COMM_WORLD, &rank);
    PMPI_Get_processor_name(hostname, &len);

    if (rank==0) {
        int *freq,i,j;
        freq=(int *)malloc(sizeof(int)*numproc*numproc); /* indexed up to numproc*numproc-1 below */
        char *temp;
        temp=(char*)malloc(sizeof(char)*(numproc-1));
        MPI_Status *stat, *stat1;
        stat = (MPI_Status*)malloc(sizeof(MPI_Status)*(numproc-1));
        stat1 = (MPI_Status*)malloc(sizeof(MPI_Status)*(numproc-1));
        MPI_Request *req;
        req = (MPI_Request *)malloc(sizeof(MPI_Request)*(numproc-1));
        int N=numproc*numproc;

        for(i=1; i<numproc; i++) {
            PMPI_Recv(temp+i-1, 1, MPI_CHAR, i, 0, MPI_COMM_WORLD, stat+(i-1));//, req+(i-1)*2);
        }

        for(i=1; i<numproc; i++) {
            PMPI_Recv(freq+i*numproc, numproc, MPI_INT, i, 1, MPI_COMM_WORLD,
                      stat1+(i-1));
        }

        printf("echo\n");
        // MPI_Waitall((numproc-1), req, stat);
        for (i=1; i<numproc; i++) {
            printf("Rank %d ", i);
            for (j=0; j<numproc; j++) {
                if(j!=i) {
                    int loc = i*numproc+j;
                    printf("%d ",freq[loc]);
                }
            }
            printf("\n");
        }
    }

    else {
        int i, *nsend;
        char *rMsg, msg='x';
        rMsg=(char*)malloc(sizeof(char));
        nsend=(int*)malloc(sizeof(int)*numproc);
        // msg=(char*)malloc(sizeof(char));
        // memset(msg, 'z', sizeof(char));
        memset(nsend, 0, sizeof(int)*numproc);
        MPI_Request *req;
        req = (MPI_Request *)malloc(sizeof(MPI_Request)*(numproc+1)); /* +1 for the final Isend of nsend */
        MPI_Status *stat;
        stat = (MPI_Status*)malloc(sizeof(MPI_Status)*(numproc-1));
        for (i=0; i<numproc; i++) {
            if(i!=rank) {
                *(nsend+i)+=*(nsend+i)+1;
                PMPI_Isend(&msg, 1, MPI_CHAR, i, 0, MPI_COMM_WORLD, &(req[i]));
            }
        }
        // printf("Echo-1\n");
        for (i=1; i<numproc; i++) {
            if (i!=rank)
                PMPI_Recv(rMsg, 1, MPI_CHAR, i, 0, MPI_COMM_WORLD, stat+i-1);
        }
        // printf("Echo-2\n");
        MPI_Isend(nsend, numproc, MPI_INT, 0, 1, MPI_COMM_WORLD, req+numproc);
        // MPI_Isend(msg, 1, MPI_CHAR, i, 0, MPI_COMM_WORLD, req+numproc);
        // printf("Echo-3\n");
        /* complete all outstanding sends before MPI_Finalize */
        for (i=0; i<=numproc; i++) {
            if (i!=rank)
                PMPI_Wait(req+i, MPI_STATUS_IGNORE);
        }
    }
    PMPI_Finalize();
    return(0);
}
Example #10
/* Same behavior as PMPI_Irsend.c */
int PMPI_Issend (void* message, int count, MPI_Datatype datatype, int dest,
        int tag, MPI_Comm comm, MPI_Request* request) {
  return PMPI_Isend(message,count,datatype,dest,tag,comm,request);
}
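
Going by the comment, the sibling file PMPI_Irsend.c presumably contains the analogous forwarding stub, along the lines of the sketch below (an assumption based on the comment, not the actual file contents).

/* Presumed contents of PMPI_Irsend.c, sketched from the comment above. */
int PMPI_Irsend (void* message, int count, MPI_Datatype datatype, int dest,
        int tag, MPI_Comm comm, MPI_Request* request) {
  return PMPI_Isend(message,count,datatype,dest,tag,comm,request);
}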
Example #11
int main(int argc, char** argv) {

    MPI_Init(&argc, &argv);

    int rank, peer, commsize;

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &commsize);
    if (rank % 2) peer = (rank - 1) % commsize;
    else peer = (rank + 1) % commsize;

    if (commsize % 2 != 0) {
        fprintf(stderr, "Use even number of processes.\n");
        exit(EXIT_FAILURE);
    }

    char* mpi_inbuf;
    char* mpi_outbuf;
    char* pmpi_inbuf;
    char* pmpi_outbuf;

    test_start("isend/irecv + test (2, vector[[int], count=2, blklen=3, stride=5])");
    init_buffers(20*sizeof(int), &mpi_inbuf, &pmpi_inbuf, &mpi_outbuf, &pmpi_outbuf);

    MPI_Datatype vector_ddt;
    MPI_Type_vector(2, 3, 5, MPI_INT, &vector_ddt);
    MPI_Type_commit(&vector_ddt);

    MPI_Datatype pmpi_vector_ddt;
    PMPI_Type_vector(2, 3, 5, MPI_INT, &pmpi_vector_ddt);
    PMPI_Type_commit(&pmpi_vector_ddt);

    MPI_Request requests_mpi[2];
    MPI_Request requests_pmpi[2];
    MPI_Status statuses_mpi[2]; 
    MPI_Status statuses_pmpi[2];

    if (rank % 2 == 0) {
        MPI_Isend(mpi_inbuf, 2, vector_ddt, peer, 0, MPI_COMM_WORLD, &(requests_mpi[0]));
        MPI_Irecv(mpi_outbuf, 2, vector_ddt, peer, 0, MPI_COMM_WORLD, &(requests_mpi[1]));

        PMPI_Isend(pmpi_inbuf, 2, pmpi_vector_ddt, peer, 0, MPI_COMM_WORLD, &(requests_pmpi[0]));
        PMPI_Irecv(pmpi_outbuf, 2, pmpi_vector_ddt, peer, 0, MPI_COMM_WORLD, &(requests_pmpi[1]));       
    }
    else {
        MPI_Irecv(mpi_outbuf, 2, vector_ddt, peer, 0, MPI_COMM_WORLD, &(requests_mpi[0]));       
        MPI_Isend(mpi_inbuf, 2, vector_ddt, peer, 0, MPI_COMM_WORLD, &(requests_mpi[1]));

        PMPI_Irecv(pmpi_outbuf, 2, pmpi_vector_ddt, peer, 0, MPI_COMM_WORLD, &(requests_pmpi[0]));       
        PMPI_Isend(pmpi_inbuf, 2, pmpi_vector_ddt, peer, 0, MPI_COMM_WORLD, &(requests_pmpi[1]));
    }

    int flag;
    flag = 0;
    while (flag == 0) MPI_Test(&(requests_mpi[0]), &flag, &(statuses_mpi[0]));
    flag = 0;
    while (flag == 0) MPI_Test(&(requests_mpi[1]), &flag, &(statuses_mpi[1]));
    flag = 0;
    while (flag == 0) MPI_Test(&(requests_pmpi[0]), &flag, &(statuses_pmpi[0]));
    flag = 0;
    while (flag == 0) MPI_Test(&(requests_pmpi[1]), &flag, &(statuses_pmpi[1]));

    int res = compare_buffers(20*sizeof(int), &mpi_inbuf, &pmpi_inbuf, &mpi_outbuf, &pmpi_outbuf);
    free_buffers(&mpi_inbuf, &pmpi_inbuf, &mpi_outbuf, &pmpi_outbuf);
    test_result(res);

    MPI_Type_free(&vector_ddt);
    PMPI_Type_free(&pmpi_vector_ddt);

    MPI_Finalize();

}
Example #12
int main(int argc, char **argv) {

    /* Validate arguments */
    if (argc != 5) {
        fprintf(stderr, "Usage: %s [input file] [output file] [grid size] [iterations]\n", argv[0]);
        return 1;
    }

    double t1 = MPI_Wtime();

    /* Initialize MPI */
    int tasks, rank;
    MPI_Init(NULL, NULL);
    MPI_Comm_size(MPI_COMM_WORLD, &tasks);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    double t_init = MPI_Wtime();

    /* Declare global variables */
    double *initial = NULL;
    int N = atoi(argv[3]);
    int T = atoi(argv[4]);

    /*****************************************************************/
    /* READ DATA                                                     */
    /*****************************************************************/
    if (rank == 0) {
        if (DEBUG) {
            fprintf(stderr, "\n-------------------------------------------\n");
            fprintf(stderr, "Starting heat.c with %d processes\n", tasks);
        }

        /* Read data */
        FILE *in = fopen(argv[1], "r");
        initial = malloc(N * N * sizeof(double));

        if (DEBUG) {
            for (int i = 1; i <= N; i++) {
                for (int j = 1; j <= N; j++) {
                    double val = (double) (i * (N - i - 1) * j * (N - j - 1));
                    initial[(i - 1) * N + (j - 1)] = val;
                }
            }
        } else {
            for (int i = 0; i < N * N; i++) {
                int x, y;
                double z;
                fscanf(in, "%d %d %lf\n", &x, &y, &z);
                initial[(x - 1) * N + (y - 1)] = z;
            }
        }

        fclose(in);
    }

    double t_read = MPI_Wtime();

    /*****************************************************************/
    /* DISTRIBUTE DATA                                               */
    /*****************************************************************/
    /* Preliminaries */
    int rowsPerWorker = N / tasks;
    int extra = N % tasks;

    /* Determine neighboring workers */
    int pred = rank - 1;
    int succ = rank + 1;

    /* Determine how many values each worker will get */
    int offset = 0;
    int offsets[tasks];
    int items[tasks];

    for (int i = 0; i < tasks; i++) {
        items[i] = rowsPerWorker * N;
        if (i < extra) items[i] += N;

        offsets[i] = offset;
        offset += items[i];
    }

    /* Scatter the rows appropriately */
    int myrows = items[rank] / N;

    /* Allocate an extra row of padding on either end */
    double *current = calloc((items[rank] + 2 * N), sizeof(double));
    double *old     = calloc((items[rank] + 2 * N), sizeof(double));

    if (DEBUG && rank == 0) fprintf(stderr, "scattering...\n");

    PMPI_Scatterv(initial, items, offsets, MPI_DOUBLE,
                  current + N, items[rank], MPI_DOUBLE, 0, MPI_COMM_WORLD);

    double t_scatter = MPI_Wtime();

    /*****************************************************************/
    /* CALCULATE                                                     */
    /*****************************************************************/
    MPI_Request req;
    double t_net = 0;

    for (int t = 0; t < T; t++) {
        if (DEBUG) fprintf(stderr, "Beginning iteration %d: rank %d\n", t, rank);

        /* Swap old and current so we can overwrite current */
        double *temp = current;
        current = old;
        old = temp;

        /* Hold onto some useful pointers into old */
        double *succrow  = old + N * (myrows + 1);
        double *predrow  = old;
        double *firstrow = old + N;
        double *lastrow  = old + N * myrows;
        double t_temp = MPI_Wtime();

        /* Send last row to succ and receive it from pred if eligible */
        if (succ < tasks) {
            PMPI_Isend(lastrow, N, MPI_DOUBLE,
                       succ, 0, MPI_COMM_WORLD, &req);
        }
        if (pred >= 0) {
            PMPI_Recv(predrow, N, MPI_DOUBLE, pred, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }
        /* Complete the send before old is overwritten in the next iteration */
        if (succ < tasks) {
            PMPI_Wait(&req, MPI_STATUS_IGNORE);
        }

        /* Send first row to pred and receive it from succ if eligible */
        if (pred >= 0) {
            PMPI_Isend(firstrow, N, MPI_DOUBLE, pred, 0, MPI_COMM_WORLD, &req);
        }
        if (succ < tasks) {
            PMPI_Recv(succrow, N, MPI_DOUBLE, succ, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }
        if (pred >= 0) {
            PMPI_Wait(&req, MPI_STATUS_IGNORE);
        }

        t_net += MPI_Wtime() - t_temp;

        /* Determine current from old, predrow, and succrow */
        for (int j = 1; j <= myrows; j++) {
            for (int k = 0; k < N; k++) {

                /* Determine adjacent cells */
                double left = 0, right = 0;
                if (k > 0    ) left  = old[j * N + k - 1];
                if (k < N - 1) right = old[j * N + k + 1];

                double top    = old[(j - 1) * N + k];
                double bottom = old[(j + 1) * N + k];
                double focus  = old[j * N + k];

                /* Calculate the new cell value */
                current[j * N + k] = focus + .1 * (top + bottom - 2 * focus)
                                     + .1 * (left + right - 2 * focus);
            }

        }
    }

    free(old);
    double t_work = MPI_Wtime();

    /*****************************************************************/
    /* WRITE THE OUTPUT                                              */
    /*****************************************************************/
    PMPI_Gatherv(current + N, items[rank], MPI_DOUBLE, initial,
                 items, offsets, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    double t_gather = MPI_Wtime();
    free(current);

    if (rank == 0 && !DEBUG) {
        FILE *out = fopen(argv[2], "w");

        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                fprintf(out, "%d %d %lf\n", i, j, initial[i * N + j]);
            }
        }
        fclose(out);
    }

    double t2 = MPI_Wtime();

    if (rank == 0) {
        fprintf(stderr, "-----------------------------------------\n");
        fprintf(stderr, "TIMING INFORMATION                       \n");
        fprintf(stderr, "-----------------------------------------\n");
        fprintf(stderr, "RANK     INIT     READ  SCATTER     WORK   GATHER    WRITE    TOTAL     NET\n");
    }

    MPI_Barrier(MPI_COMM_WORLD);

    fprintf(stderr, "%4.2d %8.2f %8.2f %8.2f %8.2f %8.2f %8.2f %8.2f %8.2f\n",
            rank, t_init - t1, t_read - t_init, t_scatter - t_read, t_work - t_scatter,
            t_gather - t_work, t2 - t_gather, t2 - t1, t_net);

    free(initial);

    MPI_Finalize();
    return 0;
}
Example #13
int MPI_Isend(void *buf, int count, MPI_Datatype datatype, int dst,
              int tag, MPI_Comm comm, MPI_Request * request)
{
  return PMPI_Isend(buf, count, datatype, dst, tag, comm, request);
}
Example #14
int MPI_Win_free(MPI_Win * win)
{
    static const char FCNAME[] = "MTCORE_Win_free";
    int mpi_errno = MPI_SUCCESS;
    MTCORE_Win *uh_win;
    int user_rank, user_nprocs, user_local_rank, user_local_nprocs;
    int i, j;
    MPI_Request *reqs = NULL;
    MPI_Status *stats = NULL;

    MTCORE_DBG_PRINT_FCNAME();

    MTCORE_Fetch_uh_win_from_cache(*win, uh_win);

    if (uh_win == NULL) {
        /* normal window */
        return PMPI_Win_free(win);
    }

    /* mtcore window starts */

    PMPI_Comm_rank(uh_win->user_comm, &user_rank);
    PMPI_Comm_size(uh_win->user_comm, &user_nprocs);
    PMPI_Comm_rank(uh_win->local_user_comm, &user_local_rank);
    PMPI_Comm_size(uh_win->local_user_comm, &user_local_nprocs);

    /* First unlock global active window */
    if ((uh_win->info_args.epoch_type & MTCORE_EPOCH_FENCE) ||
        (uh_win->info_args.epoch_type & MTCORE_EPOCH_PSCW)) {

        MTCORE_DBG_PRINT("[%d]unlock_all(active_win 0x%x)\n", user_rank, uh_win->active_win);

        /* Since all processes must be in win_free, we do not need to worry
         * about losing asynchronous progress. */
        mpi_errno = PMPI_Win_unlock_all(uh_win->active_win);
        if (mpi_errno != MPI_SUCCESS)
            goto fn_fail;
    }

    if (user_local_rank == 0) {
        MTCORE_Func_start(MTCORE_FUNC_WIN_FREE, user_nprocs, user_local_nprocs);
    }

    /* Notify the helpers of the target helper window handle. Note that helpers
     * cannot fetch the corresponding window without this handle, so only the
     * global communicator can be used here. */
    if (user_local_rank == 0) {
        reqs = calloc(MTCORE_ENV.num_h, sizeof(MPI_Request));
        stats = calloc(MTCORE_ENV.num_h, sizeof(MPI_Status));

        for (j = 0; j < MTCORE_ENV.num_h; j++) {
            mpi_errno = PMPI_Isend(&uh_win->h_win_handles[j], 1, MPI_UNSIGNED_LONG,
                                   MTCORE_H_RANKS_IN_LOCAL[j], 0, MTCORE_COMM_LOCAL, &reqs[j]);
        }
        mpi_errno = PMPI_Waitall(MTCORE_ENV.num_h, reqs, stats);
        if (mpi_errno != MPI_SUCCESS)
            goto fn_fail;
    }

    /* Free uh_win before local_uh_win, because all incoming operations
     * should be completed before the shared buffers are freed.
     *
     * We do not need an additional barrier in Manticore to wait for all
     * operations to complete, because Win_free already adds an internal
     * barrier that waits for operations on that window to complete.
     */
    if (uh_win->num_uh_wins > 0 && uh_win->uh_wins) {
        MTCORE_DBG_PRINT("\t free uh windows\n");
        for (i = 0; i < uh_win->num_uh_wins; i++) {
            if (uh_win->uh_wins[i]) {
                mpi_errno = PMPI_Win_free(&uh_win->uh_wins[i]);
                if (mpi_errno != MPI_SUCCESS)
                    goto fn_fail;
            }
        }
    }

    if (uh_win->active_win) {
        MTCORE_DBG_PRINT("\t free active window\n");
        mpi_errno = PMPI_Win_free(&uh_win->active_win);
        if (mpi_errno != MPI_SUCCESS)
            goto fn_fail;
    }

    if (uh_win->local_uh_win) {
        MTCORE_DBG_PRINT("\t free shared window\n");
        mpi_errno = PMPI_Win_free(&uh_win->local_uh_win);
        if (mpi_errno != MPI_SUCCESS)
            goto fn_fail;
    }

    if (uh_win->user_group != MPI_GROUP_NULL) {
        mpi_errno = PMPI_Group_free(&uh_win->user_group);
        if (mpi_errno != MPI_SUCCESS)
            goto fn_fail;
    }

    if (uh_win->ur_h_comm && uh_win->ur_h_comm != MPI_COMM_NULL) {
        MTCORE_DBG_PRINT("\t free user root + helpers communicator\n");
        mpi_errno = PMPI_Comm_free(&uh_win->ur_h_comm);
        if (mpi_errno != MPI_SUCCESS)
            goto fn_fail;
    }

    if (uh_win->local_uh_comm && uh_win->local_uh_comm != MTCORE_COMM_LOCAL) {
        MTCORE_DBG_PRINT("\t free shared communicator\n");
        mpi_errno = PMPI_Comm_free(&uh_win->local_uh_comm);
        if (mpi_errno != MPI_SUCCESS)
            goto fn_fail;
    }
    if (uh_win->local_uh_group != MPI_GROUP_NULL) {
        mpi_errno = PMPI_Group_free(&uh_win->local_uh_group);
        if (mpi_errno != MPI_SUCCESS)
            goto fn_fail;
    }

    if (uh_win->uh_comm != MPI_COMM_NULL && uh_win->uh_comm != MPI_COMM_WORLD) {
        MTCORE_DBG_PRINT("\t free uh communicator\n");
        mpi_errno = PMPI_Comm_free(&uh_win->uh_comm);
        if (mpi_errno != MPI_SUCCESS)
            goto fn_fail;
    }
    if (uh_win->uh_group != MPI_GROUP_NULL) {
        mpi_errno = PMPI_Group_free(&uh_win->uh_group);
        if (mpi_errno != MPI_SUCCESS)
            goto fn_fail;
    }

    if (uh_win->local_user_comm && uh_win->local_user_comm != MTCORE_COMM_USER_LOCAL) {
        MTCORE_DBG_PRINT("\t free local USER communicator\n");
        mpi_errno = PMPI_Comm_free(&uh_win->local_user_comm);
        if (mpi_errno != MPI_SUCCESS)
            goto fn_fail;
    }

    if (uh_win->user_root_comm && uh_win->user_root_comm != MTCORE_COMM_UR_WORLD) {
        MTCORE_DBG_PRINT("\t free ur communicator\n");
        mpi_errno = PMPI_Comm_free(&uh_win->user_root_comm);
        if (mpi_errno != MPI_SUCCESS)
            goto fn_fail;
    }

    MTCORE_DBG_PRINT("\t free window cache\n");
    MTCORE_Remove_uh_win_from_cache(*win);

    MTCORE_DBG_PRINT("\t free user window\n");
    mpi_errno = PMPI_Win_free(win);
    if (mpi_errno != MPI_SUCCESS)
        goto fn_fail;

    /* Free the PSCW arrays in case the user does not call complete/wait. */
    if (uh_win->start_ranks_in_win_group)
        free(uh_win->start_ranks_in_win_group);
    if (uh_win->post_ranks_in_win_group)
        free(uh_win->post_ranks_in_win_group);

    /* uh_win->user_comm is created by user, will be freed by user. */

#if defined(MTCORE_ENABLE_RUNTIME_LOAD_OPT)
    if (uh_win->h_ops_counts)
        free(uh_win->h_ops_counts);
    if (uh_win->h_bytes_counts)
        free(uh_win->h_bytes_counts);
#endif

    if (uh_win->targets) {
        for (i = 0; i < user_nprocs; i++) {
            if (uh_win->targets[i].base_h_offsets)
                free(uh_win->targets[i].base_h_offsets);
            if (uh_win->targets[i].h_ranks_in_uh)
                free(uh_win->targets[i].h_ranks_in_uh);
            if (uh_win->targets[i].segs)
                free(uh_win->targets[i].segs);
        }
        free(uh_win->targets);
    }
    if (uh_win->h_ranks_in_uh)
        free(uh_win->h_ranks_in_uh);
    if (uh_win->h_win_handles)
        free(uh_win->h_win_handles);
    if (uh_win->uh_wins)
        free(uh_win->uh_wins);

    free(uh_win);

    MTCORE_DBG_PRINT("Freed MTCORE window 0x%x\n", *win);

  fn_exit:
    if (reqs)
        free(reqs);
    if (stats)
        free(stats);
    return mpi_errno;

  fn_fail:

    goto fn_exit;
}