/*
   MatStashScatterBegin_Ref - Begins communicating the stashed (off-process) matrix
   entries: reconciles the insert mode across all ranks, counts how many stashed
   (row,col,value) entries go to each process, posts nonblocking receives for the
   incoming indices and values, packs the send buffers, and posts the nonblocking
   sends.  Completion is handled elsewhere; the receive requests are interleaved
   into the contiguous stash->recv_waits array for MatStashScatterGetMesg_Private().

   Input:
     mat    - the matrix being assembled (mat->insertmode may be updated here)
     stash  - the stash holding this rank's cached off-process entries
     owners - row-ownership ranges: process j owns rows [owners[j], owners[j+1])

   Collective: every rank must call this (it performs an MPIU_Allreduce).
*/
static PetscErrorCode MatStashScatterBegin_Ref(Mat mat,MatStash *stash,PetscInt *owners)
{
  PetscInt           *owner,*startv,*starti,tag1=stash->tag1,tag2=stash->tag2,bs2;
  PetscInt           size=stash->size,nsends;
  PetscErrorCode     ierr;
  PetscInt           count,*sindices,**rindices,i,j,idx,lastidx,l;
  PetscScalar        **rvalues,*svalues;
  MPI_Comm           comm = stash->comm;
  MPI_Request        *send_waits,*recv_waits,*recv_waits1,*recv_waits2;
  PetscMPIInt        *sizes,*nlengths,nreceives;
  PetscInt           *sp_idx,*sp_idy;
  PetscScalar        *sp_val;
  PetscMatStashSpace space,space_next;

  PetscFunctionBegin;
  { /* make sure all processors are either in INSERTMODE or ADDMODE */
    InsertMode addv;
    /* Bitwise-OR reduction of the enum values: if some ranks used ADD_VALUES and
       others INSERT_VALUES, the OR contains both bits and we can detect the mix. */
    ierr = MPIU_Allreduce((PetscEnum*)&mat->insertmode,(PetscEnum*)&addv,1,MPIU_ENUM,MPI_BOR,PetscObjectComm((PetscObject)mat));CHKERRQ(ierr);
    if (addv == (ADD_VALUES|INSERT_VALUES)) SETERRQ(PetscObjectComm((PetscObject)mat),PETSC_ERR_ARG_WRONGSTATE,"Some processors inserted others added");
    mat->insertmode = addv; /* in case this processor had no cache */
  }
  bs2 = stash->bs*stash->bs; /* number of scalars per stashed (block) entry */

  /* first count number of contributors to each processor */
  ierr = PetscCalloc1(size,&sizes);CHKERRQ(ierr);
  ierr = PetscCalloc1(size,&nlengths);CHKERRQ(ierr);
  ierr = PetscMalloc1(stash->n+1,&owner);CHKERRQ(ierr); /* owner[i] = destination rank of the i-th stashed entry */

  i       = j = 0;
  lastidx = -1;
  space   = stash->space_head; /* walk the linked list of stash-space chunks */
  while (space) {
    space_next = space->next;
    sp_idx     = space->idx;
    for (l=0; l<space->local_used; l++) {
      /* if indices are NOT locally sorted, need to start search at the beginning */
      if (lastidx > (idx = sp_idx[l])) j = 0;
      lastidx = idx;
      /* linear scan over ownership ranges, resuming from the previous hit */
      for (; j<size; j++) {
        if (idx >= owners[j] && idx < owners[j+1]) {
          nlengths[j]++; owner[i] = j; break;
        }
      }
      i++;
    }
    space = space_next;
  }

  /* Now check what procs get messages - and compute nsends. */
  for (i=0, nsends=0; i<size; i++) {
    if (nlengths[i]) {
      sizes[i] = 1; nsends++; /* sizes[] is a 0/1 flag: does rank i get a message? */
    }
  }

  {PetscMPIInt *onodes,*olengths;
   /* Determine the number of messages to expect, their lengths, from from-ids */
   ierr = PetscGatherNumberOfMessages(comm,sizes,nlengths,&nreceives);CHKERRQ(ierr);
   ierr = PetscGatherMessageLengths(comm,nsends,nreceives,nlengths,&onodes,&olengths);CHKERRQ(ierr);
   /* since clubbing row,col - lengths are multiplied by 2 */
   for (i=0; i<nreceives; i++) olengths[i] *=2;
   ierr = PetscPostIrecvInt(comm,tag1,nreceives,onodes,olengths,&rindices,&recv_waits1);CHKERRQ(ierr);
   /* values are size 'bs2' lengths (and remove earlier factor 2) */
   for (i=0; i<nreceives; i++) olengths[i] = olengths[i]*bs2/2;
   ierr = PetscPostIrecvScalar(comm,tag2,nreceives,onodes,olengths,&rvalues,&recv_waits2);CHKERRQ(ierr);
   ierr = PetscFree(onodes);CHKERRQ(ierr);
   ierr = PetscFree(olengths);CHKERRQ(ierr);}

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  ierr = PetscMalloc2(bs2*stash->n,&svalues,2*(stash->n+1),&sindices);CHKERRQ(ierr);
  ierr = PetscMalloc1(2*nsends,&send_waits);CHKERRQ(ierr); /* 2 requests per destination: indices + values */
  ierr = PetscMalloc2(size,&startv,size,&starti);CHKERRQ(ierr);
  /* use 2 sends the first with all_a, the next with all_i and all_j */
  startv[0] = 0; starti[0] = 0;
  for (i=1; i<size; i++) {
    startv[i] = startv[i-1] + nlengths[i-1];
    starti[i] = starti[i-1] + 2*nlengths[i-1]; /* index buffer holds row AND col per entry */
  }

  /* Pack: walk the stash chunks again, scattering each entry to its
     destination's region.  startv/starti act as per-destination write cursors
     and are consumed (advanced) during this pass. */
  i     = 0;
  space = stash->space_head;
  while (space) {
    space_next = space->next;
    sp_idx     = space->idx;
    sp_idy     = space->idy;
    sp_val     = space->val;
    for (l=0; l<space->local_used; l++) {
      j = owner[i];
      if (bs2 == 1) {
        svalues[startv[j]] = sp_val[l];
      } else {
        PetscInt    k;
        PetscScalar *buf1,*buf2;
        buf1 = svalues+bs2*startv[j];
        buf2 = space->val + bs2*l;
        for (k=0; k<bs2; k++) buf1[k] = buf2[k];
      }
      /* rows go in the first half of destination j's index region, cols in the second */
      sindices[starti[j]]             = sp_idx[l];
      sindices[starti[j]+nlengths[j]] = sp_idy[l];
      startv[j]++;
      starti[j]++;
      i++;
    }
    space = space_next;
  }
  /* startv[] was consumed above; rebuild the region offsets for the sends */
  startv[0] = 0;
  for (i=1; i<size; i++) startv[i] = startv[i-1] + nlengths[i-1];

  for (i=0,count=0; i<size; i++) {
    if (sizes[i]) {
      ierr = MPI_Isend(sindices+2*startv[i],2*nlengths[i],MPIU_INT,i,tag1,comm,send_waits+count++);CHKERRQ(ierr);
      ierr = MPI_Isend(svalues+bs2*startv[i],bs2*nlengths[i],MPIU_SCALAR,i,tag2,comm,send_waits+count++);CHKERRQ(ierr);
    }
  }
#if defined(PETSC_USE_INFO)
  ierr = PetscInfo1(NULL,"No of messages: %d \n",nsends);CHKERRQ(ierr);
  for (i=0; i<size; i++) {
    if (sizes[i]) {
      /* NOTE(review): the byte-count expression involves sizeof() and so has type
         size_t, while %d expects int — works on common ABIs for small sizes but is
         a format mismatch; confirm against PETSc's PetscInfo format conventions. */
      ierr = PetscInfo2(NULL,"Mesg_to: %d: size: %d bytes\n",i,nlengths[i]*(bs2*sizeof(PetscScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
    }
  }
#endif
  ierr = PetscFree(nlengths);CHKERRQ(ierr);
  ierr = PetscFree(owner);CHKERRQ(ierr);
  ierr = PetscFree2(startv,starti);CHKERRQ(ierr);
  ierr = PetscFree(sizes);CHKERRQ(ierr);

  /* recv_waits need to be contiguous for MatStashScatterGetMesg_Private() */
  ierr = PetscMalloc1(2*nreceives,&recv_waits);CHKERRQ(ierr);
  for (i=0; i<nreceives; i++) {
    recv_waits[2*i]   = recv_waits1[i]; /* even slots: index receives */
    recv_waits[2*i+1] = recv_waits2[i]; /* odd slots: value receives */
  }
  stash->recv_waits = recv_waits;

  ierr = PetscFree(recv_waits1);CHKERRQ(ierr);
  ierr = PetscFree(recv_waits2);CHKERRQ(ierr);

  /* hand ownership of all buffers/requests to the stash; freed when the scatter completes */
  stash->svalues         = svalues;
  stash->sindices        = sindices;
  stash->rvalues         = rvalues;
  stash->rindices        = rindices;
  stash->send_waits      = send_waits;
  stash->nsends          = nsends;
  stash->nrecvs          = nreceives;
  stash->reproduce_count = 0;
  PetscFunctionReturn(0);
}
/*
   MatPtAPNumeric_MPIAIJ_MPIAIJ - Numeric phase of C = P^T * A * P for MPIAIJ matrices.

   Requires the symbolic data stored in c->ptap by a prior MatPtAP symbolic call
   (errors out otherwise).  For each local row i of A it forms the sparse row
   AP[i,:] = Ad[i,:]*P_loc + Ao[i,:]*P_oth, then accumulates P[i,:]^T * AP[i,:]
   into the locally-owned part of C (ba) and the off-process part (coa), which is
   communicated to the owning ranks and merged before C is assembled.

   Input:
     A, P - the factors (MPIAIJ); their numerical values may have changed since
            the symbolic phase
     C    - the product matrix created by the symbolic phase; filled in here

   Two kernels, selected by ptap->scalable:
     - dense axpy: apa is a dense work row of length pN (fast, non-scalable memory)
     - sparse axpy: apa holds only the nonzeros of the current AP row, matched
       against the symbolic column pattern (scalable, slower)

   Collective on the communicator of C.
*/
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode      ierr;
  Mat_MPIAIJ          *a =(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ          *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data; /* diagonal / off-diagonal blocks of A */
  Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data; /* diagonal / off-diagonal blocks of P */
  Mat_SeqAIJ          *p_loc,*p_oth;
  Mat_PtAPMPI         *ptap;
  Mat_Merge_SeqsToMPI *merge;
  PetscInt            *adi=ad->i,*aoi=ao->i,*adj,*aoj,*apJ,nextp;
  PetscInt            *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pj;
  PetscInt            i,j,k,anz,pnz,apnz,nextap,row,*cj;
  MatScalar           *ada,*aoa,*apa,*pa,*ca,*pa_loc,*pa_oth,valtmp;
  PetscInt            am =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
  MPI_Comm            comm;
  PetscMPIInt         size,rank,taga,*len_s;
  PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
  PetscInt            **buf_ri,**buf_rj;
  PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
  MPI_Request         *s_waits,*r_waits;
  MPI_Status          *status;
  MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
  PetscInt            *api,*apj,*coi,*coj;
  PetscInt            *poJ=po->j,*pdJ=pd->j,pcstart=P->cmap->rstart,pcend=P->cmap->rend;
  PetscBool           scalable;
#if defined(PTAP_PROFILE)
  PetscLogDouble t0,t1,t2,t3,t4,et2_AP=0.0,et2_PtAP=0.0,t2_0,t2_1,t2_2;
#endif

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t0);CHKERRQ(ierr);
#endif
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  ptap = c->ptap;
  if (!ptap) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_INCOMP,"MatPtAP() has not been called to create matrix C yet, cannot use MAT_REUSE_MATRIX");
  merge    = ptap->merge;
  apa      = ptap->apa; /* work row for A*P, allocated by the symbolic phase */
  scalable = ptap->scalable;

  /* 1) get P_oth = ptap->P_oth  and  P_loc = ptap->P_loc */
  /*--------------------------------------------------*/
  if (ptap->reuse == MAT_INITIAL_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtASymbolic() */
    /* skip calling MatGetBrowsOfAoCols() and MatMPIAIJGetLocalMat() */
    ptap->reuse = MAT_REUSE_MATRIX;
  } else { /* update numerical values of P_oth and P_loc */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
    ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
  }
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t1);CHKERRQ(ierr);
#endif

  /* 2) compute numeric C_seq = P_loc^T*A_loc*P - dominating part */
  /*--------------------------------------------------------------*/
  /* get data from symbolic products (CSR arrays i/j/a of the Seq matrices) */
  p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j; pJ = pj_loc; pa_loc = p_loc->a;
  pi_oth = p_oth->i; pj_oth = p_oth->j; pa_oth = p_oth->a;

  coi  = merge->coi; coj = merge->coj;
  ierr = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr); /* coa: values of Co = (p->B)^T*AP, destined for other ranks */

  bi     = merge->bi; bj = merge->bj;
  owners = merge->rowmap->range;
  ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);  /* ba: Cseq->a */

  api = ptap->api; apj = ptap->apj; /* symbolic CSR structure of A*P */

  if (!scalable) { /* Do dense axpy on apa (length of pN, stores A[i,:]*P) - nonscalable, but fast */
    ierr = PetscInfo(C,"Using non-scalable dense axpy\n");CHKERRQ(ierr);
    /*-----------------------------------------------------------------------------------------------------*/
    for (i=0; i<am; i++) {
#if defined(PTAP_PROFILE)
      ierr = PetscTime(&t2_0);CHKERRQ(ierr);
#endif
      /* 2-a) form i-th sparse row of A_loc*P = Ad*P_loc + Ao*P_oth */
      /*------------------------------------------------------------*/
      apJ = apj + api[i];

      /* diagonal portion of A */
      anz = adi[i+1] - adi[i];
      adj = ad->j + adi[i];
      ada = ad->a + adi[i];
      for (j=0; j<anz; j++) {
        row = adj[j];
        pnz = pi_loc[row+1] - pi_loc[row];
        pj  = pj_loc + pi_loc[row];
        pa  = pa_loc + pi_loc[row];
        /* perform dense axpy: scatter A(i,row)*P(row,:) into the dense work row */
        valtmp = ada[j];
        for (k=0; k<pnz; k++) {
          apa[pj[k]] += valtmp*pa[k];
        }
        ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
      }

      /* off-diagonal portion of A */
      anz = aoi[i+1] - aoi[i];
      aoj = ao->j + aoi[i];
      aoa = ao->a + aoi[i];
      for (j=0; j<anz; j++) {
        row = aoj[j];
        pnz = pi_oth[row+1] - pi_oth[row];
        pj  = pj_oth + pi_oth[row];
        pa  = pa_oth + pi_oth[row];
        /* perform dense axpy */
        valtmp = aoa[j];
        for (k=0; k<pnz; k++) {
          apa[pj[k]] += valtmp*pa[k];
        }
        ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
      }
#if defined(PTAP_PROFILE)
      ierr    = PetscTime(&t2_1);CHKERRQ(ierr);
      et2_AP += t2_1 - t2_0;
#endif

      /* 2-b) Compute Cseq = P_loc[i,:]^T*AP[i,:] using outer product */
      /*--------------------------------------------------------------*/
      apnz = api[i+1] - api[i];
      /* put the value into Co=(p->B)^T*AP (off-diagonal part, send to others) */
      pnz = po->i[i+1] - po->i[i];
      poJ = po->j + po->i[i];
      pA  = po->a + po->i[i];
      for (j=0; j<pnz; j++) {
        row = poJ[j];
        cnz = coi[row+1] - coi[row];
        cj  = coj + coi[row];
        ca  = coa + coi[row];
        /* perform dense axpy: gather from the dense row via the column pattern of Co */
        valtmp = pA[j];
        for (k=0; k<cnz; k++) {
          ca[k] += valtmp*apa[cj[k]];
        }
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }

      /* put the value into Cd (diagonal part) */
      pnz = pd->i[i+1] - pd->i[i];
      pdJ = pd->j + pd->i[i];
      pA  = pd->a + pd->i[i];
      for (j=0; j<pnz; j++) {
        row = pdJ[j];
        cnz = bi[row+1] - bi[row];
        cj  = bj + bi[row];
        ca  = ba + bi[row];
        /* perform dense axpy */
        valtmp = pA[j];
        for (k=0; k<cnz; k++) {
          ca[k] += valtmp*apa[cj[k]];
        }
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }

      /* zero the current row of A*P (only its nonzero slots, via apJ) */
      for (k=0; k<apnz; k++) apa[apJ[k]] = 0.0;
#if defined(PTAP_PROFILE)
      ierr      = PetscTime(&t2_2);CHKERRQ(ierr);
      et2_PtAP += t2_2 - t2_1;
#endif
    }
  } else { /* Do sparse axpy on apa (length of ap_rmax, stores A[i,:]*P) - scalable, but slower */
    ierr = PetscInfo(C,"Using scalable sparse axpy\n");CHKERRQ(ierr);
    /*-----------------------------------------------------------------------------------------*/
    pA = pa_loc;
    for (i=0; i<am; i++) {
#if defined(PTAP_PROFILE)
      ierr = PetscTime(&t2_0);CHKERRQ(ierr);
#endif
      /* form i-th sparse row of A*P */
      apnz = api[i+1] - api[i];
      apJ  = apj + api[i];

      /* diagonal portion of A */
      anz = adi[i+1] - adi[i];
      adj = ad->j + adi[i];
      ada = ad->a + adi[i];
      for (j=0; j<anz; j++) {
        row    = adj[j];
        pnz    = pi_loc[row+1] - pi_loc[row];
        pj     = pj_loc + pi_loc[row];
        pa     = pa_loc + pi_loc[row];
        valtmp = ada[j];
        /* merge-walk: P's columns are a subset of AP's symbolic columns,
           so advance k over apJ until each pj[nextp] is matched */
        nextp  = 0;
        for (k=0; nextp<pnz; k++) {
          if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
            apa[k] += valtmp*pa[nextp++];
          }
        }
        ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
      }

      /* off-diagonal portion of A */
      anz = aoi[i+1] - aoi[i];
      aoj = ao->j + aoi[i];
      aoa = ao->a + aoi[i];
      for (j=0; j<anz; j++) {
        row    = aoj[j];
        pnz    = pi_oth[row+1] - pi_oth[row];
        pj     = pj_oth + pi_oth[row];
        pa     = pa_oth + pi_oth[row];
        valtmp = aoa[j];
        nextp  = 0;
        for (k=0; nextp<pnz; k++) {
          if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
            apa[k] += valtmp*pa[nextp++];
          }
        }
        ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
      }
#if defined(PTAP_PROFILE)
      ierr    = PetscTime(&t2_1);CHKERRQ(ierr);
      et2_AP += t2_1 - t2_0;
#endif

      /* 2-b) Compute Cseq = P_loc[i,:]^T*AP[i,:] using outer product */
      /*--------------------------------------------------------------*/
      pnz = pi_loc[i+1] - pi_loc[i];
      pJ  = pj_loc + pi_loc[i];
      /* NOTE: poJ and pdJ are running cursors here, advanced one entry per P_loc
         nonzero across ALL rows — this relies on P_loc's columns being the
         interleaving of po and pd in the same order; do not reorder this loop. */
      for (j=0; j<pnz; j++) {
        nextap = 0;
        row    = pJ[j]; /* global index */
        if (row < pcstart || row >=pcend) { /* put the value into Co */
          row = *poJ;
          cj  = coj + coi[row];
          ca  = coa + coi[row];
          poJ++;
        } else {                            /* put the value into Cd */
          row = *pdJ;
          cj  = bj + bi[row];
          ca  = ba + bi[row];
          pdJ++;
        }
        valtmp = pA[j];
        for (k=0; nextap<apnz; k++) {
          if (cj[k]==apJ[nextap]) ca[k] += valtmp*apa[nextap++];
        }
        ierr = PetscLogFlops(2.0*apnz);CHKERRQ(ierr);
      }
      pA += pnz; /* advance the value cursor past row i of P_loc */

      /* zero the current row info for A*P */
      ierr = PetscMemzero(apa,apnz*sizeof(MatScalar));CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
      ierr      = PetscTime(&t2_2);CHKERRQ(ierr);
      et2_PtAP += t2_2 - t2_1;
#endif
    }
  }
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t2);CHKERRQ(ierr);
#endif

  /* 3) send and recv matrix values coa */
  /*------------------------------------*/
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;
  len_s  = merge->len_s;
  ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
  ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);

  ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = merge->owners_co[proc];
    ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
    k++;
  }

  if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}
  ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
  ierr = PetscFree(r_waits);CHKERRQ(ierr);
  ierr = PetscFree(coa);CHKERRQ(ierr); /* safe: sends completed above */
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t3);CHKERRQ(ierr);
#endif

  /* 4) insert local Cseq and received values into Cmpi */
  /*------------------------------------------------------*/
  ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *(buf_ri_k[k]);
    nextrow[k]  = buf_ri_k[k]+1;           /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
  }

  for (i=0; i<cm; i++) {
    row  = owners[rank] + i; /* global row index of C_seq */
    bj_i = bj + bi[i];       /* col indices of the i-th row of C */
    ba_i = ba + bi[i];
    bnz  = bi[i+1] - bi[i];

    /* add received vals into ba */
    for (k=0; k<merge->nrecv; k++) { /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        cnz    = *(nextci[k]+1) - *nextci[k];
        cj     = buf_rj[k] + *(nextci[k]);
        ca     = abuf_r[k] + *(nextci[k]);
        /* merge-walk: received columns are a subset of bj_i's columns */
        nextcj = 0;
        for (j=0; nextcj<cnz; j++) {
          if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
            ba_i[j] += ca[nextcj++];
          }
        }
        nextrow[k]++; nextci[k]++;
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }
    }
    ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ierr = PetscFree(ba);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr); /* receive buffers were allocated as one slab; abuf_r[0] owns it */
  ierr = PetscFree(abuf_r);CHKERRQ(ierr);
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t4);CHKERRQ(ierr);
  /* NOTE(review): prints from rank 1 only (not rank 0) — presumably intentional
     debug output for multi-rank profiling runs; confirm before relying on it */
  if (rank==1) PetscPrintf(MPI_COMM_SELF," [%d] PtAPNum %g/P + %g/PtAP( %g + %g ) + %g/comm + %g/Cloc = %g\n\n",rank,t1-t0,t2-t1,et2_AP,et2_PtAP,t3-t2,t4-t3,t4-t0);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}