Example #1
static PetscErrorCode MatStashBlockTypeSetUp(MatStash *stash)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (stash->blocktype == MPI_DATATYPE_NULL) {
    PetscInt     bs2 = PetscSqr(stash->bs);
    PetscMPIInt  blocklens[2];
    MPI_Aint     displs[2];
    MPI_Datatype types[2],stype;
    /* C++ std::complex is not my favorite datatype.  Since it is not POD, we cannot use offsetof to find the offset of
     * vals.  But the layout is actually guaranteed by the standard, so we do a little dance here with struct
     * DummyBlock, substituting PetscReal for PetscComplex so that we can determine the offset.
     */
    struct DummyBlock {PetscInt row,col; PetscReal vals;};

    stash->blocktype_size = offsetof(struct DummyBlock,vals) + bs2*sizeof(PetscScalar);
    if (stash->blocktype_size % sizeof(PetscInt)) { /* Implies that PetscInt is larger and does not satisfy alignment without padding */
      stash->blocktype_size += sizeof(PetscInt) - stash->blocktype_size % sizeof(PetscInt);
    }
    ierr = PetscSegBufferCreate(stash->blocktype_size,1,&stash->segsendblocks);CHKERRQ(ierr);
    ierr = PetscSegBufferCreate(stash->blocktype_size,1,&stash->segrecvblocks);CHKERRQ(ierr);
    ierr = PetscSegBufferCreate(sizeof(MatStashFrame),1,&stash->segrecvframe);CHKERRQ(ierr);
    blocklens[0] = 2;
    blocklens[1] = bs2;
    displs[0] = offsetof(struct DummyBlock,row);
    displs[1] = offsetof(struct DummyBlock,vals);
    types[0] = MPIU_INT;
    types[1] = MPIU_SCALAR;
    ierr = MPI_Type_create_struct(2,blocklens,displs,types,&stype);CHKERRQ(ierr);
    ierr = MPI_Type_commit(&stype);CHKERRQ(ierr);
    ierr = MPI_Type_create_resized(stype,0,stash->blocktype_size,&stash->blocktype);CHKERRQ(ierr); /* MPI-2 */
    ierr = MPI_Type_commit(&stash->blocktype);CHKERRQ(ierr);
    ierr = MPI_Type_free(&stype);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
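The construction above packs the (row, col) index pair and a bs*bs block of scalars into a single MPI datatype whose extent is padded to the stored block size, so an array of packed blocks can be sent with a plain count. A standalone sketch of the same technique in plain MPI follows; the struct name Block and the fixed block size are illustration-only assumptions, not part of the PETSc source.

#include <mpi.h>
#include <stddef.h>             /* offsetof */

#define BS2 4                   /* scalars per block (assumed for illustration) */

typedef struct {
  int    row,col;
  double vals[BS2];
} Block;

int main(int argc,char **argv)
{
  MPI_Datatype types[2],stype,blocktype;
  int          blocklens[2];
  MPI_Aint     displs[2];

  MPI_Init(&argc,&argv);
  types[0]     = MPI_INT;     types[1]     = MPI_DOUBLE;
  blocklens[0] = 2;           blocklens[1] = BS2;
  displs[0]    = offsetof(Block,row);
  displs[1]    = offsetof(Block,vals);
  MPI_Type_create_struct(2,blocklens,displs,types,&stype);
  /* Resize so the extent equals sizeof(Block); consecutive Blocks in a packed
   * buffer are then addressed correctly without the caller knowing the padding. */
  MPI_Type_create_resized(stype,0,sizeof(Block),&blocktype);
  MPI_Type_commit(&blocktype);
  MPI_Type_free(&stype);

  /* ... use blocktype in MPI_Send/MPI_Recv on arrays of Block ... */

  MPI_Type_free(&blocktype);
  MPI_Finalize();
  return 0;
}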
Example #2
static PetscErrorCode PetscCommBuildTwoSided_Ibarrier(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode ierr;
  PetscMPIInt    nrecvs,tag,done,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata;
  MPI_Request    *sendreqs,barrier;
  PetscSegBuffer segrank,segdata;

  PetscFunctionBegin;
  ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  tdata = (char*)todata;
  ierr  = PetscMalloc1(nto,&sendreqs);CHKERRQ(ierr);
  for (i=0; i<nto; i++) {
    ierr = MPI_Issend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  ierr = PetscSegBufferCreate(sizeof(PetscMPIInt),4,&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(unitbytes,4*count,&segdata);CHKERRQ(ierr);

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  for (done=0; !done; ) {
    PetscMPIInt flag;
    MPI_Status  status;
    ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag,comm,&flag,&status);CHKERRQ(ierr);
    if (flag) {                 /* incoming message */
      PetscMPIInt *recvrank;
      void        *buf;
      ierr      = PetscSegBufferGet(segrank,1,&recvrank);CHKERRQ(ierr);
      ierr      = PetscSegBufferGet(segdata,count,&buf);CHKERRQ(ierr);
      *recvrank = status.MPI_SOURCE;
      ierr      = MPI_Recv(buf,count,dtype,status.MPI_SOURCE,tag,comm,MPI_STATUS_IGNORE);CHKERRQ(ierr);
      nrecvs++;
    }
    if (barrier == MPI_REQUEST_NULL) {
      PetscMPIInt sent,nsends;
      ierr = PetscMPIIntCast(nto,&nsends);CHKERRQ(ierr);
      ierr = MPI_Testall(nsends,sendreqs,&sent,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
      if (sent) {
#if defined(PETSC_HAVE_MPI_IBARRIER)
        ierr = MPI_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#elif defined(PETSC_HAVE_MPIX_IBARRIER)
        ierr = MPIX_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#endif
        ierr = PetscFree(sendreqs);CHKERRQ(ierr);
      }
    } else {
      ierr = MPI_Test(&barrier,&done,MPI_STATUS_IGNORE);CHKERRQ(ierr);
    }
  }
  *nfrom = nrecvs;
  ierr   = PetscSegBufferExtractAlloc(segrank,fromranks);CHKERRQ(ierr);
  ierr   = PetscSegBufferDestroy(&segrank);CHKERRQ(ierr);
  ierr   = PetscSegBufferExtractAlloc(segdata,fromdata);CHKERRQ(ierr);
  ierr   = PetscSegBufferDestroy(&segdata);CHKERRQ(ierr);
  ierr   = PetscCommDestroy(&comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
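This routine implements a nonblocking-consensus exchange: synchronous sends announce the messages, MPI_Iprobe drains whatever has arrived, and once all local sends have been matched the rank joins an MPI_Ibarrier; when the barrier completes, every announcement has been received. Below is a minimal sketch of the same pattern in plain MPI-3, stripped of the PETSc error handling and PetscSegBuffer bookkeeping; the function name, tag 0, and the caller-sized receive arrays are illustration-only assumptions.

#include <mpi.h>
#include <stdlib.h>

/* Send one int to each rank in to[0..nto-1]; receive whatever other ranks send
 * here.  recvranks/recvvals must be sized by the caller for the worst case
 * (e.g. the communicator size).  Returns the number of messages received. */
static int nbx_exchange(MPI_Comm comm,int nto,const int *to,const int *sendvals,
                        int *recvranks,int *recvvals)
{
  MPI_Request *sendreqs = (MPI_Request*)malloc((size_t)nto*sizeof(MPI_Request));
  MPI_Request  barrier  = MPI_REQUEST_NULL;
  int          i,nrecv = 0,done = 0,barrier_started = 0;

  for (i=0; i<nto; i++) MPI_Issend(&sendvals[i],1,MPI_INT,to[i],0,comm,&sendreqs[i]);

  while (!done) {
    int        flag;
    MPI_Status status;
    MPI_Iprobe(MPI_ANY_SOURCE,0,comm,&flag,&status);
    if (flag) {                 /* an announced message has arrived */
      recvranks[nrecv] = status.MPI_SOURCE;
      MPI_Recv(&recvvals[nrecv],1,MPI_INT,status.MPI_SOURCE,0,comm,MPI_STATUS_IGNORE);
      nrecv++;
    }
    if (!barrier_started) {
      int sent;
      MPI_Testall(nto,sendreqs,&sent,MPI_STATUSES_IGNORE);
      if (sent) {               /* every local synchronous send has been matched */
        MPI_Ibarrier(comm,&barrier);
        barrier_started = 1;
      }
    } else {
      MPI_Test(&barrier,&done,MPI_STATUS_IGNORE);  /* all ranks matched: finished */
    }
  }
  free(sendreqs);
  return nrecv;
}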
Example #3
static PetscErrorCode PetscCommBuildTwoSided_Ibarrier(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscInt nto,const PetscMPIInt *toranks,const void *todata,PetscInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode ierr;
  PetscMPIInt    nrecvs,tag,unitbytes,done;
  PetscInt       i;
  char           *tdata;
  MPI_Request    *sendreqs,barrier;
  PetscSegBuffer segrank,segdata;

  PetscFunctionBegin;
  ierr  = PetscCommGetNewTag(comm,&tag);CHKERRQ(ierr);
  ierr  = MPI_Type_size(dtype,&unitbytes);CHKERRQ(ierr);
  tdata = (char*)todata;
  ierr  = PetscMalloc(nto*sizeof(MPI_Request),&sendreqs);CHKERRQ(ierr);
  for (i=0; i<nto; i++) {
    ierr = MPI_Issend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  ierr = PetscSegBufferCreate(sizeof(PetscMPIInt),4,&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(unitbytes,4*count,&segdata);CHKERRQ(ierr);

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  for (done=0; !done; ) {
    PetscMPIInt flag;
    MPI_Status  status;
    ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag,comm,&flag,&status);CHKERRQ(ierr);
    if (flag) {                 /* incoming message */
      PetscMPIInt *recvrank;
      void        *buf;
      ierr      = PetscSegBufferGet(&segrank,1,&recvrank);CHKERRQ(ierr);
      ierr      = PetscSegBufferGet(&segdata,count,&buf);CHKERRQ(ierr);
      *recvrank = status.MPI_SOURCE;
      ierr      = MPI_Recv(buf,count,dtype,status.MPI_SOURCE,tag,comm,MPI_STATUS_IGNORE);CHKERRQ(ierr);
      nrecvs++;
    }
    if (barrier == MPI_REQUEST_NULL) {
      PetscMPIInt sent,nsends;
      ierr = PetscMPIIntCast(nto,&nsends);CHKERRQ(ierr);
      ierr = MPI_Testall(nsends,sendreqs,&sent,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
      if (sent) {
        ierr = MPI_Ibarrier(comm,&barrier);CHKERRQ(ierr);
        ierr = PetscFree(sendreqs);CHKERRQ(ierr);
      }
    } else {
      ierr = MPI_Test(&barrier,&done,MPI_STATUS_IGNORE);CHKERRQ(ierr);
    }
  }
  *nfrom = nrecvs;
  ierr   = PetscSegBufferExtractAlloc(&segrank,fromranks);CHKERRQ(ierr);
  ierr   = PetscSegBufferDestroy(&segrank);CHKERRQ(ierr);
  ierr   = PetscSegBufferExtractAlloc(&segdata,fromdata);CHKERRQ(ierr);
  ierr   = PetscSegBufferDestroy(&segdata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
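Example #3 appears to be an earlier revision of the routine in Example #2: it takes PetscInt counts, sizes the datatype with MPI_Type_size rather than MPI_Type_get_extent, reuses the caller's communicator via PetscCommGetNewTag instead of duplicating it, calls MPI_Ibarrier unconditionally, and passes the PetscSegBuffer objects by address, which presumably reflects the older form of that API.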
Example #4
int main(int argc,char **argv)
{
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom;
  PetscInt       i,n;
  PetscBool      verbose,build_twosided_f;
  Unit           *todata,*fromdata;
  MPI_Datatype   dtype;

  PetscInitialize(&argc,&argv,(char*)0,help);
  ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr);

  verbose = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-verbose",&verbose,NULL);CHKERRQ(ierr);
  build_twosided_f = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-build_twosided_f",&build_twosided_f,NULL);CHKERRQ(ierr);

  for (i=1,nto=0; i<size; i*=2) nto++;
  ierr = PetscMalloc2(nto,&todata,nto,&toranks);CHKERRQ(ierr);
  for (n=0,i=1; i<size; n++,i*=2) {
    toranks[n] = (rank+i) % size;
    todata[n].rank  = (rank+i) % size;
    todata[n].value = (PetscScalar)rank;
    todata[n].ok[0] = 'o';
    todata[n].ok[1] = 'k';
    todata[n].ok[2] = 0;
  }
  if (verbose) {
    for (i=0; i<nto; i++) {
      ierr = PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d] TO %d: {%D, %g, \"%s\"}\n",rank,toranks[i],todata[i].rank,(double)PetscRealPart(todata[i].value),todata[i].ok);CHKERRQ(ierr);
    }
    ierr = PetscSynchronizedFlush(PETSC_COMM_WORLD,PETSC_STDOUT);CHKERRQ(ierr);
  }

  ierr = MakeDatatype(&dtype);CHKERRQ(ierr);

  if (build_twosided_f) {
    struct FCtx fctx;
    PetscMPIInt *todummy,*fromdummy;
    fctx.rank    = rank;
    fctx.nto     = nto;
    fctx.toranks = toranks;
    fctx.todata  = todata;
    ierr = PetscSegBufferCreate(sizeof(Unit),1,&fctx.seg);CHKERRQ(ierr);
    ierr = PetscMalloc1(nto,&todummy);CHKERRQ(ierr);
    for (i=0; i<nto; i++) todummy[i] = rank;
    ierr = PetscCommBuildTwoSidedF(PETSC_COMM_WORLD,1,MPI_INT,nto,toranks,todummy,&nfrom,&fromranks,&fromdummy,2,FSend,FRecv,&fctx);CHKERRQ(ierr);
    ierr = PetscFree(todummy);CHKERRQ(ierr);
    ierr = PetscFree(fromdummy);CHKERRQ(ierr);
    ierr = PetscSegBufferExtractAlloc(fctx.seg,&fromdata);CHKERRQ(ierr);
    ierr = PetscSegBufferDestroy(&fctx.seg);CHKERRQ(ierr);
  } else {
    ierr = PetscCommBuildTwoSided(PETSC_COMM_WORLD,1,dtype,nto,toranks,todata,&nfrom,&fromranks,&fromdata);CHKERRQ(ierr);
  }
  ierr = MPI_Type_free(&dtype);CHKERRQ(ierr);

  if (verbose) {
    PetscInt *iranks,*iperm;
    ierr = PetscMalloc2(nfrom,&iranks,nfrom,&iperm);CHKERRQ(ierr);
    for (i=0; i<nfrom; i++) {
      iranks[i] = fromranks[i];
      iperm[i] = i;
    }
    /* Receive ordering is non-deterministic in general, so sort to make verbose output deterministic. */
    ierr = PetscSortIntWithPermutation(nfrom,iranks,iperm);CHKERRQ(ierr);
    for (i=0; i<nfrom; i++) {
      PetscInt ip = iperm[i];
      ierr = PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d] FROM %d: {%D, %g, \"%s\"}\n",rank,fromranks[ip],fromdata[ip].rank,(double)PetscRealPart(fromdata[ip].value),fromdata[ip].ok);CHKERRQ(ierr);
    }
    ierr = PetscSynchronizedFlush(PETSC_COMM_WORLD,PETSC_STDOUT);CHKERRQ(ierr);
    ierr = PetscFree2(iranks,iperm);CHKERRQ(ierr);
  }

  if (nto != nfrom) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB,"[%d] From ranks %d does not match To ranks %d",rank,nto,nfrom);
  for (i=1; i<size; i*=2) {
    PetscMPIInt expected_rank = (rank-i+size)%size;
    PetscBool flg;
    for (n=0; n<nfrom; n++) {
      if (expected_rank == fromranks[n]) goto found;
    }
    SETERRQ2(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"[%d] Could not find expected from rank %d",rank,expected_rank);
    found:
    if (PetscRealPart(fromdata[n].value) != expected_rank) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB,"[%d] Got data %g from rank %d",rank,(double)PetscRealPart(fromdata[n].value),expected_rank);
    ierr = PetscStrcmp(fromdata[n].ok,"ok",&flg);CHKERRQ(ierr);
    if (!flg) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB,"[%d] Got string %s from rank %d",rank,fromdata[n].ok,expected_rank);
  }
  ierr = PetscFree2(todata,toranks);CHKERRQ(ierr);
  ierr = PetscFree(fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  ierr = PetscFinalize();
  return 0;
}
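The test program above relies on a Unit struct and a MakeDatatype() helper that are not part of this excerpt. The following is a hedged reconstruction, consistent with the fields and format specifiers used in main() (rank, value, ok[3]) and with the resized-struct technique from Example #1; the actual test source may differ in detail.

#include <petscsys.h>

typedef struct {
  PetscInt    rank;
  PetscScalar value;
  char        ok[3];
} Unit;

static PetscErrorCode MakeDatatype(MPI_Datatype *dtype)
{
  PetscErrorCode ierr;
  MPI_Datatype   dtypes[3],tmptype;
  PetscMPIInt    lengths[3];
  MPI_Aint       displs[3];
  Unit           dummy;

  PetscFunctionBegin;
  dtypes[0]  = MPIU_INT;
  dtypes[1]  = MPIU_SCALAR;
  dtypes[2]  = MPI_CHAR;
  lengths[0] = 1;
  lengths[1] = 1;
  lengths[2] = 3;
  /* Displacements measured from the start of the struct */
  displs[0] = (char*)&dummy.rank  - (char*)&dummy;
  displs[1] = (char*)&dummy.value - (char*)&dummy;
  displs[2] = (char*)&dummy.ok    - (char*)&dummy;
  ierr = MPI_Type_create_struct(3,lengths,displs,dtypes,&tmptype);CHKERRQ(ierr);
  ierr = MPI_Type_commit(&tmptype);CHKERRQ(ierr);
  /* Pad the extent to sizeof(Unit) so arrays of Unit can be sent directly */
  ierr = MPI_Type_create_resized(tmptype,0,sizeof(Unit),dtype);CHKERRQ(ierr);
  ierr = MPI_Type_commit(dtype);CHKERRQ(ierr);
  ierr = MPI_Type_free(&tmptype);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}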
Example #5
static PetscErrorCode VecAssemblyBegin_MPI_BTS(Vec X)
{
  Vec_MPI        *x = (Vec_MPI*)X->data;
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       i,j,jb,bs;

  PetscFunctionBegin;
  if (X->stash.donotstash) PetscFunctionReturn(0);

  ierr = PetscObjectGetComm((PetscObject)X,&comm);CHKERRQ(ierr);
  ierr = VecGetBlockSize(X,&bs);CHKERRQ(ierr);
#if defined(PETSC_USE_DEBUG)
  {
    InsertMode addv;
    ierr = MPIU_Allreduce((PetscEnum*)&X->stash.insertmode,(PetscEnum*)&addv,1,MPIU_ENUM,MPI_BOR,comm);CHKERRQ(ierr);
    if (addv == (ADD_VALUES|INSERT_VALUES)) SETERRQ(comm,PETSC_ERR_ARG_NOTSAMETYPE,"Some processors inserted values while others added");
  }
#endif
  X->bstash.insertmode = X->stash.insertmode; /* Block stash implicitly tracks InsertMode of scalar stash */

  ierr = VecStashSortCompress_Private(&X->stash);CHKERRQ(ierr);
  ierr = VecStashSortCompress_Private(&X->bstash);CHKERRQ(ierr);

  if (!x->sendranks) {
    PetscMPIInt nowners,bnowners,*owners,*bowners;
    PetscInt ntmp;
    ierr = VecStashGetOwnerList_Private(&X->stash,X->map,&nowners,&owners);CHKERRQ(ierr);
    ierr = VecStashGetOwnerList_Private(&X->bstash,X->map,&bnowners,&bowners);CHKERRQ(ierr);
    ierr = PetscMergeMPIIntArray(nowners,owners,bnowners,bowners,&ntmp,&x->sendranks);CHKERRQ(ierr);
    x->nsendranks = ntmp;
    ierr = PetscFree(owners);CHKERRQ(ierr);
    ierr = PetscFree(bowners);CHKERRQ(ierr);
    ierr = PetscMalloc1(x->nsendranks,&x->sendhdr);CHKERRQ(ierr);
    ierr = PetscCalloc1(x->nsendranks,&x->sendptrs);CHKERRQ(ierr);
  }
  for (i=0,j=0,jb=0; i<x->nsendranks; i++) {
    PetscMPIInt rank = x->sendranks[i];
    x->sendhdr[i].insertmode = X->stash.insertmode;
    /* Initialize pointers for non-empty stashes the first time around.  Subsequent assemblies with
     * VEC_SUBSET_OFF_PROC_ENTRIES will leave the old pointers (dangling because the stash has been collected) when
     * there is nothing new to send, so that size-zero messages get sent instead. */
    x->sendhdr[i].count = 0;
    if (X->stash.n) {
      x->sendptrs[i].ints    = &X->stash.idx[j];
      x->sendptrs[i].scalars = &X->stash.array[j];
      for ( ; j<X->stash.n && X->stash.idx[j] < X->map->range[rank+1]; j++) x->sendhdr[i].count++;
    }
    x->sendhdr[i].bcount = 0;
    if (X->bstash.n) {
      x->sendptrs[i].intb    = &X->bstash.idx[jb];
      x->sendptrs[i].scalarb = &X->bstash.array[jb*bs];
      for ( ; jb<X->bstash.n && X->bstash.idx[jb]*bs < X->map->range[rank+1]; jb++) x->sendhdr[i].bcount++;
    }
  }

  if (!x->segrecvint) {ierr = PetscSegBufferCreate(sizeof(PetscInt),1000,&x->segrecvint);CHKERRQ(ierr);}
  if (!x->segrecvscalar) {ierr = PetscSegBufferCreate(sizeof(PetscScalar),1000,&x->segrecvscalar);CHKERRQ(ierr);}
  if (!x->segrecvframe) {ierr = PetscSegBufferCreate(sizeof(VecAssemblyFrame),50,&x->segrecvframe);CHKERRQ(ierr);}
  if (x->recvhdr) {             /* VEC_SUBSET_OFF_PROC_ENTRIES and this is not the first assembly */
    PetscMPIInt tag[4];
    if (!x->assembly_subset) SETERRQ(comm,PETSC_ERR_PLIB,"Attempt to reuse rendezvous when not VEC_SUBSET_OFF_PROC_ENTRIES");
    for (i=0; i<4; i++) {ierr = PetscCommGetNewTag(comm,&tag[i]);CHKERRQ(ierr);}
    for (i=0; i<x->nsendranks; i++) {
      ierr = VecAssemblySend_MPI_Private(comm,tag,i,x->sendranks[i],x->sendhdr+i,x->sendreqs+4*i,X);CHKERRQ(ierr);
    }
    for (i=0; i<x->nrecvranks; i++) {
      ierr = VecAssemblyRecv_MPI_Private(comm,tag,x->recvranks[i],x->recvhdr+i,x->recvreqs+4*i,X);CHKERRQ(ierr);
    }
    x->use_status = PETSC_TRUE;
  } else {                      /* First time */
    ierr = PetscCommBuildTwoSidedFReq(comm,3,MPIU_INT,x->nsendranks,x->sendranks,(PetscInt*)x->sendhdr,&x->nrecvranks,&x->recvranks,&x->recvhdr,4,&x->sendreqs,&x->recvreqs,VecAssemblySend_MPI_Private,VecAssemblyRecv_MPI_Private,X);CHKERRQ(ierr);
    x->use_status = PETSC_FALSE;
  }

  {
    PetscInt nstash,reallocs;
    ierr = VecStashGetInfo_Private(&X->stash,&nstash,&reallocs);CHKERRQ(ierr);
    ierr = PetscInfo2(X,"Stash has %D entries, uses %D mallocs.\n",nstash,reallocs);CHKERRQ(ierr);
    ierr = VecStashGetInfo_Private(&X->bstash,&nstash,&reallocs);CHKERRQ(ierr);
    ierr = PetscInfo2(X,"Block-Stash has %D entries, uses %D mallocs.\n",nstash,reallocs);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
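Every example on this page uses PetscSegBuffer the same way: create a segmented buffer when the final length is not known in advance, append into it piecewise with PetscSegBufferGet, and finally extract one contiguous allocation with PetscSegBufferExtractAlloc. A minimal sketch of that pattern follows; the function name CollectSquares, the chunk size 16, and the loop contents are illustration-only assumptions.

#include <petscsys.h>

static PetscErrorCode CollectSquares(PetscInt n,PetscInt **out)
{
  PetscErrorCode ierr;
  PetscSegBuffer seg;
  PetscInt       i;

  PetscFunctionBegin;
  ierr = PetscSegBufferCreate(sizeof(PetscInt),16,&seg);CHKERRQ(ierr);  /* first segment holds 16 entries */
  for (i=0; i<n; i++) {
    PetscInt *slot;
    ierr  = PetscSegBufferGet(seg,1,&slot);CHKERRQ(ierr);               /* reserve room for one more entry */
    *slot = i*i;
  }
  ierr = PetscSegBufferExtractAlloc(seg,out);CHKERRQ(ierr);             /* contiguous copy; caller frees with PetscFree */
  ierr = PetscSegBufferDestroy(&seg);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}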
Example #6
static PetscErrorCode PetscCommBuildTwoSidedFReq_Ibarrier(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
                                                          PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
                                                          PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                                          PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  PetscErrorCode ierr;
  PetscMPIInt    nrecvs,tag,*tags,done,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata;
  MPI_Request    *sendreqs,*usendreqs,*req,barrier;
  PetscSegBuffer segrank,segdata,segreq;
  PetscBool      barrier_started;

  PetscFunctionBegin;
  ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr = PetscMalloc1(ntags,&tags);CHKERRQ(ierr);
  for (i=0; i<ntags; i++) {
    ierr = PetscCommGetNewTag(comm,&tags[i]);CHKERRQ(ierr);
  }
  ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  tdata = (char*)todata;
  ierr = PetscMalloc1(nto,&sendreqs);CHKERRQ(ierr);
  ierr = PetscMalloc1(nto*ntags,&usendreqs);CHKERRQ(ierr);
  /* Post synchronous sends */
  for (i=0; i<nto; i++) {
    ierr = MPI_Issend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  /* Post actual payloads.  These are typically larger messages.  Hopefully sending these later does not slow down the
   * synchronous messages above. */
  for (i=0; i<nto; i++) {
    PetscMPIInt k;
    for (k=0; k<ntags; k++) usendreqs[i*ntags+k] = MPI_REQUEST_NULL;
    ierr = (*send)(comm,tags,i,toranks[i],tdata+count*unitbytes*i,usendreqs+i*ntags,ctx);CHKERRQ(ierr);
  }

  ierr = PetscSegBufferCreate(sizeof(PetscMPIInt),4,&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(unitbytes,4*count,&segdata);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(sizeof(MPI_Request),4,&segreq);CHKERRQ(ierr);

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done=0; !done; ) {
    PetscMPIInt flag;
    MPI_Status  status;
    ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag,comm,&flag,&status);CHKERRQ(ierr);
    if (flag) {                 /* incoming message */
      PetscMPIInt *recvrank,k;
      void        *buf;
      ierr = PetscSegBufferGet(segrank,1,&recvrank);CHKERRQ(ierr);
      ierr = PetscSegBufferGet(segdata,count,&buf);CHKERRQ(ierr);
      *recvrank = status.MPI_SOURCE;
      ierr = MPI_Recv(buf,count,dtype,status.MPI_SOURCE,tag,comm,MPI_STATUS_IGNORE);CHKERRQ(ierr);
      ierr = PetscSegBufferGet(segreq,ntags,&req);CHKERRQ(ierr);
      for (k=0; k<ntags; k++) req[k] = MPI_REQUEST_NULL;
      ierr = (*recv)(comm,tags,status.MPI_SOURCE,buf,req,ctx);CHKERRQ(ierr);
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent,nsends;
      ierr = PetscMPIIntCast(nto,&nsends);CHKERRQ(ierr);
      ierr = MPI_Testall(nsends,sendreqs,&sent,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
      if (sent) {
#if defined(PETSC_HAVE_MPI_IBARRIER)
        ierr = MPI_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#elif defined(PETSC_HAVE_MPIX_IBARRIER)
        ierr = MPIX_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#endif
        barrier_started = PETSC_TRUE;
      }
    } else {
      ierr = MPI_Test(&barrier,&done,MPI_STATUS_IGNORE);CHKERRQ(ierr);
    }
  }
  *nfrom = nrecvs;
  ierr = PetscSegBufferExtractAlloc(segrank,fromranks);CHKERRQ(ierr);
  ierr = PetscSegBufferDestroy(&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferExtractAlloc(segdata,fromdata);CHKERRQ(ierr);
  ierr = PetscSegBufferDestroy(&segdata);CHKERRQ(ierr);
  *toreqs = usendreqs;
  ierr = PetscSegBufferExtractAlloc(segreq,fromreqs);CHKERRQ(ierr);
  ierr = PetscSegBufferDestroy(&segreq);CHKERRQ(ierr);
  ierr = PetscFree(sendreqs);CHKERRQ(ierr);
  ierr = PetscFree(tags);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
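Compared with Example #2, this variant additionally posts the per-tag payload sends and receives through the user-supplied send/recv callbacks and returns their requests to the caller via toreqs/fromreqs, and it tracks barrier_started explicitly instead of testing barrier against MPI_REQUEST_NULL, working around MPICH-3.2 builds that may not create a request, as the in-code comment notes.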