PetscErrorCode AOCreateMemoryScalable_private(MPI_Comm comm,PetscInt napp,const PetscInt from_array[],const PetscInt to_array[],AO ao,PetscInt *aomap_loc)
{
  PetscErrorCode    ierr;
  AO_MemoryScalable *aomems = (AO_MemoryScalable*)ao->data;
  PetscLayout       map     = aomems->map;
  PetscInt          n_local = map->n,i,j;
  PetscMPIInt       rank,size,tag;
  PetscInt          *owner,*start,*nprocs,nsends,nreceives;
  PetscInt          nmax,count,*sindices,*rindices,idx,lastidx;
  PetscInt          *owners = aomems->map->range;
  MPI_Request       *send_waits,*recv_waits;
  MPI_Status        recv_status;
  PetscMPIInt       nindices,widx;
  PetscInt          *rbuf;
  PetscInt          n=napp,ip,ia;
  MPI_Status        *send_status;

  PetscFunctionBegin;
  ierr = PetscMemzero(aomap_loc,n_local*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  /* first count number of contributors (of from_array[]) to each processor */
  ierr = PetscMalloc(2*size*sizeof(PetscInt),&nprocs);CHKERRQ(ierr);
  ierr = PetscMemzero(nprocs,2*size*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc(n*sizeof(PetscInt),&owner);CHKERRQ(ierr);

  j       = 0;
  lastidx = -1;
  for (i=0; i<n; i++) {
    /* if indices are NOT locally sorted, need to start search at the beginning */
    if (lastidx > (idx = from_array[i])) j = 0;
    lastidx = idx;
    for (; j<size; j++) {
      if (idx >= owners[j] && idx < owners[j+1]) {
        nprocs[2*j]  += 2;     /* num of indices to be sent - in pairs (ip,ia) */
        nprocs[2*j+1] = 1;     /* send to proc[j] */
        owner[i]      = j;
        break;
      }
    }
  }
  nprocs[2*rank] = nprocs[2*rank+1] = 0; /* do not receive from self! */
  nsends = 0;
  for (i=0; i<size; i++) nsends += nprocs[2*i+1];

  /* inform other processors of number of messages and max length */
  ierr = PetscMaxSum(comm,nprocs,&nmax,&nreceives);CHKERRQ(ierr);

  /* allocate arrays */
  ierr = PetscObjectGetNewTag((PetscObject)ao,&tag);CHKERRQ(ierr);
  ierr = PetscMalloc2(nreceives*nmax,PetscInt,&rindices,nreceives,MPI_Request,&recv_waits);CHKERRQ(ierr);
  ierr = PetscMalloc3(2*n,PetscInt,&sindices,nsends,MPI_Request,&send_waits,nsends,MPI_Status,&send_status);CHKERRQ(ierr);
  ierr = PetscMalloc(size*sizeof(PetscInt),&start);CHKERRQ(ierr);

  /* post receives: */
  for (i=0; i<nreceives; i++) {
    ierr = MPI_Irecv(rindices+nmax*i,nmax,MPIU_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
  }

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  start[0] = 0;
  for (i=1; i<size; i++) start[i] = start[i-1] + nprocs[2*i-2];
  for (i=0; i<n; i++) {
    j = owner[i];
    if (j != rank) {
      ip = from_array[i];
      ia = to_array[i];
      sindices[start[j]++] = ip;
      sindices[start[j]++] = ia;
    } else { /* compute my own map */
      ip            = from_array[i] - owners[rank];
      ia            = to_array[i];
      aomap_loc[ip] = ia;
    }
  }

  start[0] = 0;
  for (i=1; i<size; i++) start[i] = start[i-1] + nprocs[2*i-2];
  for (i=0,count=0; i<size; i++) {
    if (nprocs[2*i+1]) {
      ierr = MPI_Isend(sindices+start[i],nprocs[2*i],MPIU_INT,i,tag,comm,send_waits+count);CHKERRQ(ierr);
      count++;
    }
  }
  if (nsends != count) SETERRQ2(comm,PETSC_ERR_SUP,"nsends %d != count %d",nsends,count);

  /* wait on sends */
  if (nsends) {
    ierr = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);
  }

  /* recvs */
  count = 0;
  for (j=nreceives; j>0; j--) {
    ierr = MPI_Waitany(nreceives,recv_waits,&widx,&recv_status);CHKERRQ(ierr);
    ierr = MPI_Get_count(&recv_status,MPIU_INT,&nindices);CHKERRQ(ierr);
    rbuf = rindices+nmax*widx; /* global index */

    /* compute local mapping */
    for (i=0; i<nindices; i+=2) { /* pack aomap_loc */
      ip            = rbuf[i] - owners[rank]; /* local index */
      ia            = rbuf[i+1];
      aomap_loc[ip] = ia;
    }
    count++;
  }

  ierr = PetscFree(start);CHKERRQ(ierr);
  ierr = PetscFree3(sindices,send_waits,send_status);CHKERRQ(ierr);
  ierr = PetscFree2(rindices,recv_waits);CHKERRQ(ierr);
  ierr = PetscFree(owner);CHKERRQ(ierr);
  ierr = PetscFree(nprocs);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
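/*
   Illustrative sketch (not part of PETSc): the owner-search loop used above, in isolation.
   Given the ownership ranges owners[0..size] from a PetscLayout (rank j owns global index
   idx when owners[j] <= idx < owners[j+1]), it fills owner[i] with the owning rank of each
   index. The search pointer j is only reset when the input indices are not locally sorted,
   so sorted input is scanned in O(n + size). The helper name FindOwners_Example is
   hypothetical and exists only to show the pattern.
*/
static void FindOwners_Example(PetscInt n,const PetscInt idxs[],PetscMPIInt size,const PetscInt owners[],PetscInt owner[])
{
  PetscInt i,j = 0,idx,lastidx = -1;

  for (i=0; i<n; i++) {
    if (lastidx > (idx = idxs[i])) j = 0; /* unsorted input: restart the range search */
    lastidx = idx;
    for (; j<size; j++) {
      if (idx >= owners[j] && idx < owners[j+1]) { owner[i] = j; break; }
    }
  }
}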
PetscErrorCode AOMap_MemoryScalable_private(AO ao,PetscInt n,PetscInt *ia,PetscInt *maploc)
{
  PetscErrorCode    ierr;
  AO_MemoryScalable *aomems = (AO_MemoryScalable*)ao->data;
  MPI_Comm          comm;
  PetscMPIInt       rank,size,tag1,tag2;
  PetscInt          *owner,*start,*nprocs,nsends,nreceives;
  PetscInt          nmax,count,*sindices,*rindices,i,j,idx,lastidx,*sindices2,*rindices2;
  PetscInt          *owners = aomems->map->range;
  MPI_Request       *send_waits,*recv_waits,*send_waits2,*recv_waits2;
  MPI_Status        recv_status;
  PetscMPIInt       nindices,source,widx;
  PetscInt          *rbuf,*sbuf;
  MPI_Status        *send_status,*send_status2;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)ao,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  /* first count number of contributors to each processor */
  ierr = PetscMalloc2(2*size,PetscInt,&nprocs,size,PetscInt,&start);CHKERRQ(ierr);
  ierr = PetscMemzero(nprocs,2*size*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc(n*sizeof(PetscInt),&owner);CHKERRQ(ierr);
  ierr = PetscMemzero(owner,n*sizeof(PetscInt));CHKERRQ(ierr);

  j       = 0;
  lastidx = -1;
  for (i=0; i<n; i++) {
    /* if indices are NOT locally sorted, need to start search at the beginning */
    if (lastidx > (idx = ia[i])) j = 0;
    lastidx = idx;
    for (; j<size; j++) {
      if (idx >= owners[j] && idx < owners[j+1]) {
        nprocs[2*j]++;     /* num of indices to be sent */
        nprocs[2*j+1] = 1; /* send to proc[j] */
        owner[i]      = j;
        break;
      }
    }
  }
  nprocs[2*rank] = nprocs[2*rank+1] = 0; /* do not receive from self! */
  nsends = 0;
  for (i=0; i<size; i++) nsends += nprocs[2*i+1];

  /* inform other processors of number of messages and max length */
  ierr = PetscMaxSum(comm,nprocs,&nmax,&nreceives);CHKERRQ(ierr);

  /* allocate arrays */
  ierr = PetscObjectGetNewTag((PetscObject)ao,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)ao,&tag2);CHKERRQ(ierr);
  ierr = PetscMalloc2(nreceives*nmax,PetscInt,&rindices,nreceives,MPI_Request,&recv_waits);CHKERRQ(ierr);
  ierr = PetscMalloc2(nsends*nmax,PetscInt,&rindices2,nsends,MPI_Request,&recv_waits2);CHKERRQ(ierr);
  ierr = PetscMalloc3(n,PetscInt,&sindices,nsends,MPI_Request,&send_waits,nsends,MPI_Status,&send_status);CHKERRQ(ierr);
  ierr = PetscMalloc3(n,PetscInt,&sindices2,nreceives,MPI_Request,&send_waits2,nreceives,MPI_Status,&send_status2);CHKERRQ(ierr);

  /* post 1st receives: receive others' requests
     since we don't know how long each individual message is we
     allocate the largest needed buffer for each receive. Potentially
     this is a lot of wasted space.
  */
  for (i=0,count=0; i<nreceives; i++) {
    ierr = MPI_Irecv(rindices+nmax*i,nmax,MPIU_INT,MPI_ANY_SOURCE,tag1,comm,recv_waits+count++);CHKERRQ(ierr);
  }

  /* do 1st sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  start[0] = 0;
  for (i=1; i<size; i++) start[i] = start[i-1] + nprocs[2*i-2];
  for (i=0; i<n; i++) {
    j = owner[i];
    if (j != rank) {
      sindices[start[j]++] = ia[i];
    } else { /* compute my own map */
      if (ia[i] >= owners[rank] && ia[i] < owners[rank+1]) {
        ia[i] = maploc[ia[i]-owners[rank]];
      } else {
        ia[i] = -1; /* ia[i] is not in the range of 0 and N-1, maps it to -1 */
      }
    }
  }

  start[0] = 0;
  for (i=1; i<size; i++) start[i] = start[i-1] + nprocs[2*i-2];
  for (i=0,count=0; i<size; i++) {
    if (nprocs[2*i+1]) {
      /* send my request to others */
      ierr = MPI_Isend(sindices+start[i],nprocs[2*i],MPIU_INT,i,tag1,comm,send_waits+count);CHKERRQ(ierr);
      /* post receive for the answer of my request */
      ierr = MPI_Irecv(sindices2+start[i],nprocs[2*i],MPIU_INT,i,tag2,comm,recv_waits2+count);CHKERRQ(ierr);
      count++;
    }
  }
  if (nsends != count) SETERRQ2(comm,PETSC_ERR_SUP,"nsends %d != count %d",nsends,count);

  /* wait on 1st sends */
  if (nsends) {
    ierr = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);
  }

  /* 1st recvs: others' requests */
  for (j=0; j<nreceives; j++) {
    ierr = MPI_Waitany(nreceives,recv_waits,&widx,&recv_status);CHKERRQ(ierr); /* widx: index of handle for operation that completed */
    ierr = MPI_Get_count(&recv_status,MPIU_INT,&nindices);CHKERRQ(ierr);
    rbuf   = rindices+nmax*widx; /* global index */
    source = recv_status.MPI_SOURCE;

    /* compute mapping */
    sbuf = rbuf;
    for (i=0; i<nindices; i++) sbuf[i] = maploc[rbuf[i]-owners[rank]];

    /* send mapping back to the sender */
    ierr = MPI_Isend(sbuf,nindices,MPIU_INT,source,tag2,comm,send_waits2+widx);CHKERRQ(ierr);
  }

  /* wait on 2nd sends */
  if (nreceives) {
    ierr = MPI_Waitall(nreceives,send_waits2,send_status2);CHKERRQ(ierr);
  }

  /* 2nd recvs: for the answer of my request */
  for (j=0; j<nsends; j++) {
    ierr = MPI_Waitany(nsends,recv_waits2,&widx,&recv_status);CHKERRQ(ierr);
    ierr = MPI_Get_count(&recv_status,MPIU_INT,&nindices);CHKERRQ(ierr);
    source = recv_status.MPI_SOURCE;

    /* pack output ia[] */
    rbuf  = sindices2+start[source];
    count = 0;
    for (i=0; i<n; i++) {
      if (source == owner[i]) ia[i] = rbuf[count++];
    }
  }

  /* free arrays */
  ierr = PetscFree2(nprocs,start);CHKERRQ(ierr);
  ierr = PetscFree(owner);CHKERRQ(ierr);
  ierr = PetscFree2(rindices,recv_waits);CHKERRQ(ierr);
  ierr = PetscFree2(rindices2,recv_waits2);CHKERRQ(ierr);
  ierr = PetscFree3(sindices,send_waits,send_status);CHKERRQ(ierr);
  ierr = PetscFree3(sindices2,send_waits2,send_status2);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
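/*
   Illustrative sketch (not part of PETSc): the information PetscMaxSum() supplies to the
   rendezvous above, expressed with plain MPI. nprocs[2*i] is the length of the message this
   rank would send to rank i and nprocs[2*i+1] is a 0/1 flag saying whether it sends at all;
   each rank needs (a) the length of the longest message that will arrive at it, to size the
   per-receive buffers, and (b) how many messages it will receive, to know how many
   MPI_Irecv()/MPI_Waitany() calls to post. This is only a sketch of those semantics under
   the assumption of an MPI-2.2 MPI_Reduce_scatter_block(); it is not PETSc's actual
   implementation of PetscMaxSum(), and the helper name MaxSum_Example is hypothetical.
*/
static PetscErrorCode MaxSum_Example(MPI_Comm comm,PetscMPIInt size,const PetscInt nprocs[],PetscInt *nmax,PetscInt *nreceives)
{
  PetscErrorCode ierr;
  PetscInt       i,*lens,*flags;

  PetscFunctionBegin;
  ierr  = PetscMalloc(2*size*sizeof(PetscInt),&lens);CHKERRQ(ierr);
  flags = lens + size;
  for (i=0; i<size; i++) {
    lens[i]  = nprocs[2*i];   /* length of the message I would send to rank i */
    flags[i] = nprocs[2*i+1]; /* 1 if I send to rank i, 0 otherwise           */
  }
  /* (a) rank j receives the maximum over all sources of the lengths destined for j */
  ierr = MPI_Reduce_scatter_block(lens,nmax,1,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr);
  /* (b) rank j receives the number of sources that flagged a message for j */
  ierr = MPI_Reduce_scatter_block(flags,nreceives,1,MPIU_INT,MPI_SUM,comm);CHKERRQ(ierr);
  ierr = PetscFree(lens);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}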
/*@C
   ISLocalToGlobalMappingGetInfo - Gets the neighbor information for each processor and
     each index shared by more than one processor

   Collective on ISLocalToGlobalMapping

   Input Parameters:
.  mapping - the mapping from local to global indexing

   Output Parameters:
+  nproc - number of processors that are connected to this one
.  proc - neighboring processors
.  numproc - number of indices for each subdomain (processor)
-  indices - indices of nodes (in local numbering) shared with neighbors (sorted by global numbering)

   Level: advanced

   Concepts: mapping^local to global

   Fortran Usage:
$        ISLocalToGlobalMpngGetInfoSize(ISLocalToGlobalMapping,PetscInt nproc,PetscInt numprocmax,ierr) followed by
$        ISLocalToGlobalMappingGetInfo(ISLocalToGlobalMapping,PetscInt nproc, PetscInt procs[nproc],PetscInt numprocs[nproc],
          PetscInt indices[nproc][numprocmax],ierr)
        There is no ISLocalToGlobalMappingRestoreInfo() in Fortran. You must make sure that procs[], numprocs[]
        and indices[][] are large enough arrays, either by allocating them dynamically or defining static ones large enough.

.seealso: ISLocalToGlobalMappingDestroy(), ISLocalToGlobalMappingCreateIS(), ISLocalToGlobalMappingCreate(),
          ISLocalToGlobalMappingRestoreInfo()
@*/
PetscErrorCode PETSCVEC_DLLEXPORT ISLocalToGlobalMappingGetInfo(ISLocalToGlobalMapping mapping,PetscInt *nproc,PetscInt *procs[],PetscInt *numprocs[],PetscInt **indices[])
{
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2,tag3,*len,*source,imdex;
  PetscInt       i,n = mapping->n,Ng,ng,max = 0,*lindices = mapping->indices;
  PetscInt       *nprocs,*owner,nsends,*sends,j,*starts,nmax,nrecvs,*recvs,proc;
  PetscInt       cnt,scale,*ownedsenders,*nownedsenders,rstart,nowned;
  PetscInt       node,nownedm,nt,*sends2,nsends2,*starts2,*lens2,*dest,nrecvs2,*starts3,*recvs2,k,*bprocs,*tmp;
  PetscInt       first_procs,first_numprocs,*first_indices;
  MPI_Request    *recv_waits,*send_waits;
  MPI_Status     recv_status,*send_status,*recv_statuses;
  MPI_Comm       comm = ((PetscObject)mapping)->comm;
  PetscTruth     debug = PETSC_FALSE;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(mapping,IS_LTOGM_COOKIE,1);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  if (size == 1) {
    *nproc         = 0;
    *procs         = PETSC_NULL;
    ierr           = PetscMalloc(sizeof(PetscInt),numprocs);CHKERRQ(ierr);
    (*numprocs)[0] = 0;
    ierr           = PetscMalloc(sizeof(PetscInt*),indices);CHKERRQ(ierr);
    (*indices)[0]  = PETSC_NULL;
    PetscFunctionReturn(0);
  }

  ierr = PetscOptionsGetTruth(PETSC_NULL,"-islocaltoglobalmappinggetinfo_debug",&debug,PETSC_NULL);CHKERRQ(ierr);

  /*
    Notes on ISLocalToGlobalMappingGetInfo

    globally owned node - the nodes that have been assigned to this processor in global
           numbering, just for this routine.

    nontrivial globally owned node - node assigned to this processor that is on a subdomain
           boundary (i.e. it has more than one local owner)

    locally owned node - node that exists on this processor's subdomain

    nontrivial locally owned node - node that is not in the interior (i.e. has more than one
           local subdomain)
  */
  ierr = PetscObjectGetNewTag((PetscObject)mapping,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)mapping,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)mapping,&tag3);CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    if (lindices[i] > max) max = lindices[i];
  }
  ierr = MPI_Allreduce(&max,&Ng,1,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr);
  Ng++;
  ierr  = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr  = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  scale = Ng/size + 1;
  ng    = scale;
  if (rank == size-1) ng = Ng - scale*(size-1);
  ng     = PetscMax(1,ng);
  rstart = scale*rank;

  /* determine ownership ranges of global indices */
  ierr = PetscMalloc(2*size*sizeof(PetscInt),&nprocs);CHKERRQ(ierr);
  ierr = PetscMemzero(nprocs,2*size*sizeof(PetscInt));CHKERRQ(ierr);

  /* determine owners of each local node */
  ierr = PetscMalloc(n*sizeof(PetscInt),&owner);CHKERRQ(ierr);
  for (i=0; i<n; i++) {
    proc             = lindices[i]/scale; /* processor that globally owns this index */
    nprocs[2*proc+1] = 1;                 /* processor globally owns at least one of ours */
    owner[i]         = proc;
    nprocs[2*proc]++;                     /* count of how many that processor globally owns of ours */
  }
  nsends = 0;
  for (i=0; i<size; i++) nsends += nprocs[2*i+1];
  ierr = PetscInfo1(mapping,"Number of global owners for my local data %d\n",nsends);CHKERRQ(ierr);

  /* inform other processors of number of messages and max length */
  ierr = PetscMaxSum(comm,nprocs,&nmax,&nrecvs);CHKERRQ(ierr);
  ierr = PetscInfo1(mapping,"Number of local owners for my global data %d\n",nrecvs);CHKERRQ(ierr);

  /* post receives for owned rows */
  ierr = PetscMalloc((2*nrecvs+1)*(nmax+1)*sizeof(PetscInt),&recvs);CHKERRQ(ierr);
  ierr = PetscMalloc((nrecvs+1)*sizeof(MPI_Request),&recv_waits);CHKERRQ(ierr);
  for (i=0; i<nrecvs; i++) {
    ierr = MPI_Irecv(recvs+2*nmax*i,2*nmax,MPIU_INT,MPI_ANY_SOURCE,tag1,comm,recv_waits+i);CHKERRQ(ierr);
  }

  /* pack messages containing lists of local nodes to owners */
  ierr = PetscMalloc((2*n+1)*sizeof(PetscInt),&sends);CHKERRQ(ierr);
  ierr = PetscMalloc((size+1)*sizeof(PetscInt),&starts);CHKERRQ(ierr);
  starts[0] = 0;
  for (i=1; i<size; i++) starts[i] = starts[i-1] + 2*nprocs[2*i-2];
  for (i=0; i<n; i++) {
    sends[starts[owner[i]]++] = lindices[i];
    sends[starts[owner[i]]++] = i;
  }
  ierr = PetscFree(owner);CHKERRQ(ierr);
  starts[0] = 0;
  for (i=1; i<size; i++) starts[i] = starts[i-1] + 2*nprocs[2*i-2];

  /* send the messages */
  ierr = PetscMalloc((nsends+1)*sizeof(MPI_Request),&send_waits);CHKERRQ(ierr);
  ierr = PetscMalloc((nsends+1)*sizeof(PetscInt),&dest);CHKERRQ(ierr);
  cnt = 0;
  for (i=0; i<size; i++) {
    if (nprocs[2*i]) {
      ierr      = MPI_Isend(sends+starts[i],2*nprocs[2*i],MPIU_INT,i,tag1,comm,send_waits+cnt);CHKERRQ(ierr);
      dest[cnt] = i;
      cnt++;
    }
  }
  ierr = PetscFree(starts);CHKERRQ(ierr);

  /* wait on receives */
  ierr = PetscMalloc((nrecvs+1)*sizeof(PetscMPIInt),&source);CHKERRQ(ierr);
  ierr = PetscMalloc((nrecvs+1)*sizeof(PetscMPIInt),&len);CHKERRQ(ierr);
  cnt  = nrecvs;
  ierr = PetscMalloc((ng+1)*sizeof(PetscInt),&nownedsenders);CHKERRQ(ierr);
  ierr = PetscMemzero(nownedsenders,ng*sizeof(PetscInt));CHKERRQ(ierr);
  while (cnt) {
    ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
    /* unpack receives into our local space */
    ierr          = MPI_Get_count(&recv_status,MPIU_INT,&len[imdex]);CHKERRQ(ierr);
    source[imdex] = recv_status.MPI_SOURCE;
    len[imdex]    = len[imdex]/2;
    /* count how many local owners for each of my global owned indices */
    for (i=0; i<len[imdex]; i++) nownedsenders[recvs[2*imdex*nmax+2*i]-rstart]++;
    cnt--;
  }
  ierr = PetscFree(recv_waits);CHKERRQ(ierr);

  /* count how many globally owned indices are on an edge multiplied by how many processors own them. */
  nowned  = 0;
  nownedm = 0;
  for (i=0; i<ng; i++) {
    if (nownedsenders[i] > 1) {nownedm += nownedsenders[i]; nowned++;}
  }

  /* create single array to contain rank of all local owners of each globally owned index */
  ierr      = PetscMalloc((nownedm+1)*sizeof(PetscInt),&ownedsenders);CHKERRQ(ierr);
  ierr      = PetscMalloc((ng+1)*sizeof(PetscInt),&starts);CHKERRQ(ierr);
  starts[0] = 0;
  for (i=1; i<ng; i++) {
    if (nownedsenders[i-1] > 1) starts[i] = starts[i-1] + nownedsenders[i-1];
    else starts[i] = starts[i-1];
  }

  /* for each nontrivial globally owned node list all arriving processors */
  for (i=0; i<nrecvs; i++) {
    for (j=0; j<len[i]; j++) {
      node = recvs[2*i*nmax+2*j]-rstart;
      if (nownedsenders[node] > 1) {
        ownedsenders[starts[node]++] = source[i];
      }
    }
  }

  if (debug) { /* ----------------------------------- */
    starts[0] = 0;
    for (i=1; i<ng; i++) {
      if (nownedsenders[i-1] > 1) starts[i] = starts[i-1] + nownedsenders[i-1];
      else starts[i] = starts[i-1];
    }
    for (i=0; i<ng; i++) {
      if (nownedsenders[i] > 1) {
        ierr = PetscSynchronizedPrintf(comm,"[%d] global node %d local owner processors: ",rank,i+rstart);CHKERRQ(ierr);
        for (j=0; j<nownedsenders[i]; j++) {
          ierr = PetscSynchronizedPrintf(comm,"%d ",ownedsenders[starts[i]+j]);CHKERRQ(ierr);
        }
        ierr = PetscSynchronizedPrintf(comm,"\n");CHKERRQ(ierr);
      }
    }
    ierr = PetscSynchronizedFlush(comm);CHKERRQ(ierr);
  } /* ----------------------------------- */

  /* wait on original sends */
  if (nsends) {
    ierr = PetscMalloc(nsends*sizeof(MPI_Status),&send_status);CHKERRQ(ierr);
    ierr = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);
    ierr = PetscFree(send_status);CHKERRQ(ierr);
  }
  ierr = PetscFree(send_waits);CHKERRQ(ierr);
  ierr = PetscFree(sends);CHKERRQ(ierr);
  ierr = PetscFree(nprocs);CHKERRQ(ierr);

  /* pack messages to send back to local owners */
  starts[0] = 0;
  for (i=1; i<ng; i++) {
    if (nownedsenders[i-1] > 1) starts[i] = starts[i-1] + nownedsenders[i-1];
    else starts[i] = starts[i-1];
  }
  nsends2 = nrecvs;
  ierr    = PetscMalloc((nsends2+1)*sizeof(PetscInt),&nprocs);CHKERRQ(ierr); /* length of each message */
  for (i=0; i<nrecvs; i++) {
    nprocs[i] = 1;
    for (j=0; j<len[i]; j++) {
      node = recvs[2*i*nmax+2*j]-rstart;
      if (nownedsenders[node] > 1) {
        nprocs[i] += 2 + nownedsenders[node];
      }
    }
  }
  nt = 0;
  for (i=0; i<nsends2; i++) nt += nprocs[i];

  ierr = PetscMalloc((nt+1)*sizeof(PetscInt),&sends2);CHKERRQ(ierr);
  ierr = PetscMalloc((nsends2+1)*sizeof(PetscInt),&starts2);CHKERRQ(ierr);
  starts2[0] = 0;
  for (i=1; i<nsends2; i++) starts2[i] = starts2[i-1] + nprocs[i-1];
  /*
     Each message is 1 + nprocs[i] long, and consists of
       (0) the number of nodes being sent back
       (1) the local node number,
       (2) the number of processors sharing it,
       (3) the processors sharing it
  */
  for (i=0; i<nsends2; i++) {
    cnt = 1;
    sends2[starts2[i]] = 0;
    for (j=0; j<len[i]; j++) {
      node = recvs[2*i*nmax+2*j]-rstart;
      if (nownedsenders[node] > 1) {
        sends2[starts2[i]]++;
        sends2[starts2[i]+cnt++] = recvs[2*i*nmax+2*j+1];
        sends2[starts2[i]+cnt++] = nownedsenders[node];
        ierr = PetscMemcpy(&sends2[starts2[i]+cnt],&ownedsenders[starts[node]],nownedsenders[node]*sizeof(PetscInt));CHKERRQ(ierr);
        cnt += nownedsenders[node];
      }
    }
  }

  /* receive the message lengths */
  nrecvs2 = nsends;
  ierr = PetscMalloc((nrecvs2+1)*sizeof(PetscInt),&lens2);CHKERRQ(ierr);
  ierr = PetscMalloc((nrecvs2+1)*sizeof(PetscInt),&starts3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrecvs2+1)*sizeof(MPI_Request),&recv_waits);CHKERRQ(ierr);
  for (i=0; i<nrecvs2; i++) {
    ierr = MPI_Irecv(&lens2[i],1,MPIU_INT,dest[i],tag2,comm,recv_waits+i);CHKERRQ(ierr);
  }

  /* send the message lengths */
  for (i=0; i<nsends2; i++) {
    ierr = MPI_Send(&nprocs[i],1,MPIU_INT,source[i],tag2,comm);CHKERRQ(ierr);
  }

  /* wait on receives of lens */
  if (nrecvs2) {
    ierr = PetscMalloc(nrecvs2*sizeof(MPI_Status),&recv_statuses);CHKERRQ(ierr);
    ierr = MPI_Waitall(nrecvs2,recv_waits,recv_statuses);CHKERRQ(ierr);
    ierr = PetscFree(recv_statuses);CHKERRQ(ierr);
  }
  ierr = PetscFree(recv_waits);CHKERRQ(ierr);

  starts3[0] = 0;
  nt         = 0;
  for (i=0; i<nrecvs2-1; i++) {
    starts3[i+1] = starts3[i] + lens2[i];
    nt          += lens2[i];
  }
  nt += lens2[nrecvs2-1];

  ierr = PetscMalloc((nt+1)*sizeof(PetscInt),&recvs2);CHKERRQ(ierr);
  ierr = PetscMalloc((nrecvs2+1)*sizeof(MPI_Request),&recv_waits);CHKERRQ(ierr);
  for (i=0; i<nrecvs2; i++) {
    ierr = MPI_Irecv(recvs2+starts3[i],lens2[i],MPIU_INT,dest[i],tag3,comm,recv_waits+i);CHKERRQ(ierr);
  }

  /* send the messages */
  ierr = PetscMalloc((nsends2+1)*sizeof(MPI_Request),&send_waits);CHKERRQ(ierr);
  for (i=0; i<nsends2; i++) {
    ierr = MPI_Isend(sends2+starts2[i],nprocs[i],MPIU_INT,source[i],tag3,comm,send_waits+i);CHKERRQ(ierr);
  }

  /* wait on receives */
  if (nrecvs2) {
    ierr = PetscMalloc(nrecvs2*sizeof(MPI_Status),&recv_statuses);CHKERRQ(ierr);
    ierr = MPI_Waitall(nrecvs2,recv_waits,recv_statuses);CHKERRQ(ierr);
    ierr = PetscFree(recv_statuses);CHKERRQ(ierr);
  }
  ierr = PetscFree(recv_waits);CHKERRQ(ierr);
  ierr = PetscFree(nprocs);CHKERRQ(ierr);

  if (debug) { /* ----------------------------------- */
    cnt = 0;
    for (i=0; i<nrecvs2; i++) {
      nt = recvs2[cnt++];
      for (j=0; j<nt; j++) {
        ierr = PetscSynchronizedPrintf(comm,"[%d] local node %d number of subdomains %d: ",rank,recvs2[cnt],recvs2[cnt+1]);CHKERRQ(ierr);
        for (k=0; k<recvs2[cnt+1]; k++) {
          ierr = PetscSynchronizedPrintf(comm,"%d ",recvs2[cnt+2+k]);CHKERRQ(ierr);
        }
        cnt += 2 + recvs2[cnt+1];
        ierr = PetscSynchronizedPrintf(comm,"\n");CHKERRQ(ierr);
      }
    }
    ierr = PetscSynchronizedFlush(comm);CHKERRQ(ierr);
  } /* ----------------------------------- */

  /* count number subdomains for each local node */
  ierr = PetscMalloc(size*sizeof(PetscInt),&nprocs);CHKERRQ(ierr);
  ierr = PetscMemzero(nprocs,size*sizeof(PetscInt));CHKERRQ(ierr);
  cnt  = 0;
  for (i=0; i<nrecvs2; i++) {
    nt = recvs2[cnt++];
    for (j=0; j<nt; j++) {
      for (k=0; k<recvs2[cnt+1]; k++) {
        nprocs[recvs2[cnt+2+k]]++;
      }
      cnt += 2 + recvs2[cnt+1];
    }
  }
  nt = 0;
  for (i=0; i<size; i++) nt += (nprocs[i] > 0);
  *nproc = nt;
  ierr = PetscMalloc((nt+1)*sizeof(PetscInt),procs);CHKERRQ(ierr);
  ierr = PetscMalloc((nt+1)*sizeof(PetscInt),numprocs);CHKERRQ(ierr);
  ierr = PetscMalloc((nt+1)*sizeof(PetscInt*),indices);CHKERRQ(ierr);
  ierr = PetscMalloc(size*sizeof(PetscInt),&bprocs);CHKERRQ(ierr);
  cnt  = 0;
  for (i=0; i<size; i++) {
    if (nprocs[i] > 0) {
      bprocs[i]        = cnt;
      (*procs)[cnt]    = i;
      (*numprocs)[cnt] = nprocs[i];
      ierr             = PetscMalloc(nprocs[i]*sizeof(PetscInt),&(*indices)[cnt]);CHKERRQ(ierr);
      cnt++;
    }
  }

  /* make the list of subdomains for each nontrivial local node */
  ierr = PetscMemzero(*numprocs,nt*sizeof(PetscInt));CHKERRQ(ierr);
  cnt  = 0;
  for (i=0; i<nrecvs2; i++) {
    nt = recvs2[cnt++];
    for (j=0; j<nt; j++) {
      for (k=0; k<recvs2[cnt+1]; k++) {
        (*indices)[bprocs[recvs2[cnt+2+k]]][(*numprocs)[bprocs[recvs2[cnt+2+k]]]++] = recvs2[cnt];
      }
      cnt += 2 + recvs2[cnt+1];
    }
  }
  ierr = PetscFree(bprocs);CHKERRQ(ierr);
  ierr = PetscFree(recvs2);CHKERRQ(ierr);

  /* sort the node indexing by their global numbers */
  nt = *nproc;
  for (i=0; i<nt; i++) {
    ierr = PetscMalloc(((*numprocs)[i])*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
    for (j=0; j<(*numprocs)[i]; j++) {
      tmp[j] = lindices[(*indices)[i][j]];
    }
    ierr = PetscSortIntWithArray((*numprocs)[i],tmp,(*indices)[i]);CHKERRQ(ierr);
    ierr = PetscFree(tmp);CHKERRQ(ierr);
  }

  if (debug) { /* ----------------------------------- */
    nt = *nproc;
    for (i=0; i<nt; i++) {
      ierr = PetscSynchronizedPrintf(comm,"[%d] subdomain %d number of indices %d: ",rank,(*procs)[i],(*numprocs)[i]);CHKERRQ(ierr);
      for (j=0; j<(*numprocs)[i]; j++) {
        ierr = PetscSynchronizedPrintf(comm,"%d ",(*indices)[i][j]);CHKERRQ(ierr);
      }
      ierr = PetscSynchronizedPrintf(comm,"\n");CHKERRQ(ierr);
    }
    ierr = PetscSynchronizedFlush(comm);CHKERRQ(ierr);
  } /* ----------------------------------- */

  /* wait on sends */
  if (nsends2) {
    ierr = PetscMalloc(nsends2*sizeof(MPI_Status),&send_status);CHKERRQ(ierr);
    ierr = MPI_Waitall(nsends2,send_waits,send_status);CHKERRQ(ierr);
    ierr = PetscFree(send_status);CHKERRQ(ierr);
  }

  ierr = PetscFree(starts3);CHKERRQ(ierr);
  ierr = PetscFree(dest);CHKERRQ(ierr);
  ierr = PetscFree(send_waits);CHKERRQ(ierr);
  ierr = PetscFree(nownedsenders);CHKERRQ(ierr);
  ierr = PetscFree(ownedsenders);CHKERRQ(ierr);
  ierr = PetscFree(starts);CHKERRQ(ierr);
  ierr = PetscFree(starts2);CHKERRQ(ierr);
  ierr = PetscFree(lens2);CHKERRQ(ierr);
  ierr = PetscFree(source);CHKERRQ(ierr);
  ierr = PetscFree(len);CHKERRQ(ierr);
  ierr = PetscFree(recvs);CHKERRQ(ierr);
  ierr = PetscFree(nprocs);CHKERRQ(ierr);
  ierr = PetscFree(sends2);CHKERRQ(ierr);

  /* put the information about myself as the first entry in the list */
  first_procs    = (*procs)[0];
  first_numprocs = (*numprocs)[0];
  first_indices  = (*indices)[0];
  for (i=0; i<*nproc; i++) {
    if ((*procs)[i] == rank) {
      (*procs)[0]    = (*procs)[i];
      (*numprocs)[0] = (*numprocs)[i];
      (*indices)[0]  = (*indices)[i];
      (*procs)[i]    = first_procs;
      (*numprocs)[i] = first_numprocs;
      (*indices)[i]  = first_indices;
      break;
    }
  }
  PetscFunctionReturn(0);
}
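/*
   Illustrative usage sketch (not part of PETSc): querying and printing the neighbor
   information of an existing ISLocalToGlobalMapping. It assumes `mapping` was created
   elsewhere, and assumes ISLocalToGlobalMappingRestoreInfo() mirrors the argument list of
   ISLocalToGlobalMappingGetInfo(), as suggested by the manual page above. The helper name
   PrintMappingNeighbors_Example is hypothetical.
*/
static PetscErrorCode PrintMappingNeighbors_Example(ISLocalToGlobalMapping mapping)
{
  PetscErrorCode ierr;
  PetscInt       nproc,*procs,*numprocs,**indices,i,j;
  PetscMPIInt    rank;
  MPI_Comm       comm = ((PetscObject)mapping)->comm;

  PetscFunctionBegin;
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingGetInfo(mapping,&nproc,&procs,&numprocs,&indices);CHKERRQ(ierr);
  for (i=0; i<nproc; i++) { /* entry 0 describes this processor itself */
    ierr = PetscSynchronizedPrintf(comm,"[%d] shares %d local indices with processor %d:",rank,numprocs[i],procs[i]);CHKERRQ(ierr);
    for (j=0; j<numprocs[i]; j++) {
      ierr = PetscSynchronizedPrintf(comm," %d",indices[i][j]);CHKERRQ(ierr);
    }
    ierr = PetscSynchronizedPrintf(comm,"\n");CHKERRQ(ierr);
  }
  ierr = PetscSynchronizedFlush(comm);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingRestoreInfo(mapping,&nproc,&procs,&numprocs,&indices);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}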
PetscErrorCode VecStashScatterBegin_Private(VecStash *stash,PetscInt *owners)
{
  PetscErrorCode ierr;
  PetscMPIInt    size = stash->size,tag1=stash->tag1,tag2=stash->tag2;
  PetscInt       *owner,*start,*nprocs,nsends,nreceives;
  PetscInt       nmax,count,*sindices,*rindices,i,j,idx,bs=stash->bs,lastidx;
  PetscScalar    *rvalues,*svalues;
  MPI_Comm       comm = stash->comm;
  MPI_Request    *send_waits,*recv_waits;

  PetscFunctionBegin;
  /* first count number of contributors to each processor */
  ierr = PetscCalloc1(2*size,&nprocs);CHKERRQ(ierr);
  ierr = PetscMalloc1(stash->n,&owner);CHKERRQ(ierr);

  j       = 0;
  lastidx = -1;
  for (i=0; i<stash->n; i++) {
    /* if indices are NOT locally sorted, need to start search at the beginning */
    if (lastidx > (idx = stash->idx[i])) j = 0;
    lastidx = idx;
    for (; j<size; j++) {
      if (idx >= owners[j] && idx < owners[j+1]) {
        nprocs[2*j]++;
        nprocs[2*j+1] = 1;
        owner[i]      = j;
        break;
      }
    }
  }
  nsends = 0;
  for (i=0; i<size; i++) nsends += nprocs[2*i+1];

  /* inform other processors of number of messages and max length */
  ierr = PetscMaxSum(comm,nprocs,&nmax,&nreceives);CHKERRQ(ierr);

  /* post receives:
     since we don't know how long each individual message is we
     allocate the largest needed buffer for each receive. Potentially
     this is a lot of wasted space.
  */
  ierr = PetscMalloc2(nreceives*nmax*bs,&rvalues,nreceives*nmax,&rindices);CHKERRQ(ierr);
  ierr = PetscMalloc1(2*nreceives,&recv_waits);CHKERRQ(ierr);
  for (i=0,count=0; i<nreceives; i++) {
    ierr = MPI_Irecv(rvalues+bs*nmax*i,bs*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag1,comm,recv_waits+count++);CHKERRQ(ierr);
    ierr = MPI_Irecv(rindices+nmax*i,nmax,MPIU_INT,MPI_ANY_SOURCE,tag2,comm,recv_waits+count++);CHKERRQ(ierr);
  }

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  ierr = PetscMalloc2(stash->n*bs,&svalues,stash->n,&sindices);CHKERRQ(ierr);
  ierr = PetscMalloc1(2*nsends,&send_waits);CHKERRQ(ierr);
  ierr = PetscMalloc1(size,&start);CHKERRQ(ierr);

  /* use 2 sends the first with all_v, the next with all_i */
  start[0] = 0;
  for (i=1; i<size; i++) start[i] = start[i-1] + nprocs[2*i-2];
  for (i=0; i<stash->n; i++) {
    j = owner[i];
    if (bs == 1) svalues[start[j]] = stash->array[i];
    else {
      ierr = PetscMemcpy(svalues+bs*start[j],stash->array+bs*i,bs*sizeof(PetscScalar));CHKERRQ(ierr);
    }
    sindices[start[j]] = stash->idx[i];
    start[j]++;
  }

  start[0] = 0;
  for (i=1; i<size; i++) start[i] = start[i-1] + nprocs[2*i-2];
  for (i=0,count=0; i<size; i++) {
    if (nprocs[2*i+1]) {
      ierr = MPI_Isend(svalues+bs*start[i],bs*nprocs[2*i],MPIU_SCALAR,i,tag1,comm,send_waits+count++);CHKERRQ(ierr);
      ierr = MPI_Isend(sindices+start[i],nprocs[2*i],MPIU_INT,i,tag2,comm,send_waits+count++);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(owner);CHKERRQ(ierr);
  ierr = PetscFree(start);CHKERRQ(ierr);
  /* This memory is reused in scatter end for a different purpose */
  for (i=0; i<2*size; i++) nprocs[i] = -1;

  stash->nprocs     = nprocs;
  stash->svalues    = svalues;
  stash->sindices   = sindices;
  stash->rvalues    = rvalues;
  stash->rindices   = rindices;
  stash->nsends     = nsends;
  stash->nrecvs     = nreceives;
  stash->send_waits = send_waits;
  stash->recv_waits = recv_waits;
  stash->rmax       = nmax;
  PetscFunctionReturn(0);
}
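/*
   Illustrative sketch (not part of PETSc): the two-pass bucket packing used above and in the
   other routines in this section. Pass 1 turns the per-destination counts (the even entries
   of nprocs[]) into starting offsets via an exclusive prefix sum; pass 2 scatters each entry
   into its destination's block while advancing that destination's cursor. The same offsets
   are recomputed before posting the MPI_Isend()s, since the cursors are consumed during
   packing. The helper name PackByOwner_Example is hypothetical.
*/
static void PackByOwner_Example(PetscInt n,const PetscInt idx[],const PetscInt owner[],PetscMPIInt size,const PetscInt nprocs[],PetscInt start[],PetscInt packed[])
{
  PetscInt i;

  start[0] = 0;
  for (i=1; i<size; i++) start[i] = start[i-1] + nprocs[2*(i-1)]; /* exclusive prefix sum of counts */
  for (i=0; i<n; i++) packed[start[owner[i]]++] = idx[i];         /* place entry i in its owner's block */
}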
PetscErrorCode MatGetSubMatrices_MPIDense_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIDense   *c = (Mat_MPIDense*)C->data;
  Mat            A = c->A;
  Mat_SeqDense   *a = (Mat_SeqDense*)A->data,*mat;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,idex,end,i;
  PetscInt       N = C->cmap->N,rstart = C->rmap->rstart,count;
  const PetscInt **irow,**icol,*irow_i;
  PetscInt       *nrow,*ncol,*w1,*w3,*w4,*rtable,start;
  PetscInt       **sbuf1,m,j,k,l,ct1,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*ctr,*pa,*tmp,bsz,nrqr;
  PetscInt       is_no,jmax,**rmap,*rmap_i;
  PetscInt       ctr_j,*sbuf1_j,*rbuf1_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status2;
  MPI_Comm       comm;
  PetscScalar    **rbuf2,**sbuf2;
  PetscBool      sorted;

  PetscFunctionBegin;
  comm = ((PetscObject)C)->comm;
  tag0 = ((PetscObject)C)->tag;
  size = c->size;
  rank = c->rank;
  m    = C->rmap->N;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);

  /* Check if the row/col indices are sorted */
  for (i=0; i<ismax; i++) {
    ierr = ISSorted(isrow[i],&sorted);CHKERRQ(ierr);
    if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"ISrow is not sorted");
    ierr = ISSorted(iscol[i],&sorted);CHKERRQ(ierr);
    if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IScol is not sorted");
  }

  ierr = PetscMalloc5(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol,m,PetscInt,&rtable);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
  }

  /* Create hash table for the mapping :row -> proc */
  for (i=0,j=0; i<size; i++) {
    jmax = C->rmap->range[i+1];
    for (; j<jmax; j++) {
      rtable[j] = i;
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc3(2*size,PetscInt,&w1,size,PetscInt,&w3,size,PetscInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*2*sizeof(PetscInt));CHKERRQ(ierr); /* initialize work vector */
  ierr = PetscMemzero(w3,size*sizeof(PetscInt));CHKERRQ(ierr);   /* initialize work vector */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscInt));CHKERRQ(ierr); /* initialize work vector */
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      row  = irow_i[j];
      proc = rtable[row];
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[2*j] += w4[j]; w3[j]++;}
    }
  }

  nrqs       = 0; /* no of outgoing messages */
  msz        = 0; /* total mesg length (for all procs) */
  w1[2*rank] = 0; /* no mesg sent to self */
  w3[rank]   = 0;
  for (i=0; i<size; i++) {
    if (w1[2*i]) { w1[2*i+1] = 1; nrqs++;} /* there exists a message to proc i */
  }

  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /* (proc-array) */
  for (i=0,j=0; i<size; i++) {
    if (w1[2*i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j        = pa[i];
    w1[2*j] += w1[2*j+1] + 2*w3[j];
    msz     += w1[2*j];
  }

  /* Do a global reduction to determine how many messages to expect */
  ierr = PetscMaxSum(comm,w1,&bsz,&nrqr);CHKERRQ(ierr);

  /* Allocate memory for recv buffers. Make sure rbuf1[0] exists by adding 1 to the buffer length */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&rbuf1);CHKERRQ(ierr);
  ierr = PetscMalloc(nrqr*bsz*sizeof(PetscInt),&rbuf1[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; ++i) rbuf1[i] = rbuf1[i-1] + bsz;

  /* Post the receives */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&r_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = MPI_Irecv(rbuf1[i],bsz,MPIU_INT,MPI_ANY_SOURCE,tag0,comm,r_waits1+i);CHKERRQ(ierr);
  }

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j        = pa[i];
      iptr    += ict;
      sbuf1[j] = iptr;
      ict      = w1[2*j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) { /* parse the indices of each IS */
      row  = irow_i[j];
      proc = rtable[row];
      if (proc != rank) { /* copy to the outgoing buf */
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[2*j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post receives to capture the row_data from other procs */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqs; i++) {
    j     = pa[i];
    count = (w1[2*j] - (2*sbuf1[j][0] + 1))*N;
    ierr  = PetscMalloc((count+1)*sizeof(PetscScalar),&rbuf2[i]);CHKERRQ(ierr);
    ierr  = MPI_Irecv(rbuf2[i],count,MPIU_SCALAR,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Receive messages (row_nos) and then, pack and send off the row values
     to the correct processors */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf2);CHKERRQ(ierr);
  {
    PetscScalar *sbuf2_i,*v_start;
    PetscInt    s_proc;
    for (i=0; i<nrqr; ++i) {
      ierr    = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
      s_proc  = r_status1[i].MPI_SOURCE; /* sending processor */
      rbuf1_i = rbuf1[idex];             /* Actual message from s_proc */
      /* no of rows = end - start; start is the offset just past the header
         (which occupies the first 2*rbuf1_i[0] + 1 entries of the buffer),
         while end is the total length of the received buffer */
      start = 2*rbuf1_i[0] + 1;
      ierr  = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      /* allocate memory sufficient to hold all the row values */
      ierr    = PetscMalloc((end-start)*N*sizeof(PetscScalar),&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i = sbuf2[idex];
      /* Now pack the data */
      for (j=start; j<end; j++) {
        row     = rbuf1_i[j] - rstart;
        v_start = a->v + row;
        for (k=0; k<N; k++) {
          sbuf2_i[0] = v_start[0];
          sbuf2_i++;
          v_start += C->rmap->n;
        }
      }
      /* Now send off the data */
      ierr = MPI_Isend(sbuf2[idex],(end-start)*N,MPIU_SCALAR,s_proc,tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  /* End Send-Recv of IS + row_numbers */
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqDense*)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemzero(mat->v,submats[i]->rmap->n*submats[i]->cmap->n*sizeof(PetscScalar));CHKERRQ(ierr);

      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],nrow[i],ncol[i]);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqDenseSetPreallocation(submats[i],PETSC_NULL);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices (locally owned rows) */
  {
    PetscInt    col;
    PetscScalar *imat_v,*mat_v,*imat_vi,*mat_vi;
    for (i=0; i<ismax; i++) {
      mat    = (Mat_SeqDense*)submats[i]->data;
      mat_v  = a->v;
      imat_v = mat->v;
      irow_i = irow[i];
      m      = nrow[i];
      for (j=0; j<m; j++) {
        row  = irow_i[j];
        proc = rtable[row];
        if (proc == rank) {
          row     = row - rstart;
          mat_vi  = mat_v + row;
          imat_vi = imat_v + j;
          for (k=0; k<ncol[i]; k++) {
            col          = icol[i][k];
            imat_vi[k*m] = mat_vi[col*C->rmap->n];
          }
        }
      }
    }
  }

  /* Create row map. This maps c->row to submat->row for each submat */
  /* this is a very expensive operation wrt memory usage */
  ierr = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr);
  ierr = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr);
  ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }

  /* Now Receive the row_values and assemble the rest of the matrix */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);
  {
    PetscInt    is_max,tmp1,col,*sbuf1_i,is_sz;
    PetscScalar *rbuf2_i,*imat_v,*imat_vi;

    for (tmp1=0; tmp1<nrqs; tmp1++) { /* For each message */
      ierr = MPI_Waitany(nrqs,r_waits2,&i,r_status2+tmp1);CHKERRQ(ierr);
      /* Now dig out the corresponding sbuf1, which contains the IS data_structure */
      sbuf1_i = sbuf1[pa[i]];
      is_max  = sbuf1_i[0];
      ct1     = 2*is_max+1;
      rbuf2_i = rbuf2[i];
      for (j=1; j<=is_max; j++) { /* For each IS belonging to the message */
        is_no  = sbuf1_i[2*j-1];
        is_sz  = sbuf1_i[2*j];
        mat    = (Mat_SeqDense*)submats[is_no]->data;
        imat_v = mat->v;
        rmap_i = rmap[is_no];
        m      = nrow[is_no];
        for (k=0; k<is_sz; k++,rbuf2_i+=N) { /* For each row */
          row = sbuf1_i[ct1]; ct1++;
          row = rmap_i[row];
          imat_vi = imat_v + row;
          for (l=0; l<ncol[is_no]; l++) { /* For each col */
            col          = icol[is_no][l];
            imat_vi[l*m] = rbuf2_i[col];
          }
        }
      }
    }
  }
  /* End Send-Recv of row_values */
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  ierr = PetscFree5(irow,icol,nrow,ncol,rtable);CHKERRQ(ierr);
  ierr = PetscFree3(w1,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf2[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);

  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(sbuf2);CHKERRQ(ierr);
  ierr = PetscFree(rmap[0]);CHKERRQ(ierr);
  ierr = PetscFree(rmap);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
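/*
   Illustrative sketch (not part of PETSc): the column-major (Fortran-order) addressing used
   by the dense assembly loops above. For a SeqDense value array v with leading dimension lda
   (here C->rmap->n for the parallel matrix and nrow[i] for each submatrix), entry (row,col)
   lives at v[row + col*lda]; that is why the code forms mat_vi = v + row and then reads the
   k-th requested column at mat_vi[col*lda]. The helper name DenseEntry_Example is
   hypothetical.
*/
static PetscScalar DenseEntry_Example(const PetscScalar *v,PetscInt lda,PetscInt row,PetscInt col)
{
  return v[row + col*lda]; /* column-major: each column is contiguous, rows are strided by lda */
}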