PETSC_EXTERN PetscErrorCode MatISSetMPIXAIJPreallocation_Private(Mat A, Mat B, PetscBool maxreduce)
{
  Mat_IS         *matis = (Mat_IS*)(A->data);
  PetscInt       *my_dnz,*my_onz,*dnz,*onz,*mat_ranges,*row_ownership;
  const PetscInt *global_indices_r,*global_indices_c;
  PetscInt       i,j,bs,rows,cols;
  PetscInt       lrows,lcols;
  PetscInt       local_rows,local_cols;
  PetscMPIInt    nsubdomains;
  PetscBool      isdense,issbaij;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&nsubdomains);CHKERRQ(ierr);
  ierr = MatGetSize(A,&rows,&cols);CHKERRQ(ierr);
  ierr = MatGetBlockSize(A,&bs);CHKERRQ(ierr);
  ierr = MatGetSize(matis->A,&local_rows,&local_cols);CHKERRQ(ierr);
  ierr = PetscObjectTypeCompare((PetscObject)matis->A,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  ierr = PetscObjectTypeCompare((PetscObject)matis->A,MATSEQSBAIJ,&issbaij);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingGetIndices(A->rmap->mapping,&global_indices_r);CHKERRQ(ierr);
  if (A->rmap->mapping != A->cmap->mapping) {
    ierr = ISLocalToGlobalMappingGetIndices(A->cmap->mapping,&global_indices_c);CHKERRQ(ierr);
  } else {
    global_indices_c = global_indices_r;
  }
  if (issbaij) {
    ierr = MatGetRowUpperTriangular(matis->A);CHKERRQ(ierr);
  }
  /* An SF reduce is needed to sum up properly on shared rows.
     Note that generally preallocation is not exact, since it overestimates nonzeros */
  if (!matis->sf) { /* setup SF if not yet created and allocate rootdata and leafdata */
    ierr = MatISComputeSF_Private(A);CHKERRQ(ierr);
  }
  ierr = MatGetLocalSize(A,&lrows,&lcols);CHKERRQ(ierr);
  ierr = MatPreallocateInitialize(PetscObjectComm((PetscObject)A),lrows,lcols,dnz,onz);CHKERRQ(ierr);
  /* All processes need to compute entire row ownership */
  ierr = PetscMalloc1(rows,&row_ownership);CHKERRQ(ierr);
  ierr = MatGetOwnershipRanges(A,(const PetscInt**)&mat_ranges);CHKERRQ(ierr);
  for (i=0;i<nsubdomains;i++) {
    for (j=mat_ranges[i];j<mat_ranges[i+1];j++) {
      row_ownership[j] = i;
    }
  }

  /* my_dnz and my_onz contain the exact contribution to preallocation from each local mat;
     they will then be summed up properly. This way, preallocation is always sufficient */
  ierr = PetscCalloc2(local_rows,&my_dnz,local_rows,&my_onz);CHKERRQ(ierr);
  /* preallocation as a MATAIJ */
  if (isdense) { /* special case for dense local matrices */
    for (i=0;i<local_rows;i++) {
      PetscInt index_row = global_indices_r[i];
      for (j=i;j<local_rows;j++) {
        PetscInt owner = row_ownership[index_row];
        PetscInt index_col = global_indices_c[j];
        if (index_col > mat_ranges[owner]-1 && index_col < mat_ranges[owner+1]) { /* diag block */
          my_dnz[i] += 1;
        } else { /* offdiag block */
          my_onz[i] += 1;
        }
        /* same as before, interchanging rows and cols */
        if (i != j) {
          owner = row_ownership[index_col];
          if (index_row > mat_ranges[owner]-1 && index_row < mat_ranges[owner+1]) {
            my_dnz[j] += 1;
          } else {
            my_onz[j] += 1;
          }
        }
      }
    }
  } else { /* TODO: this could be optimized using MatGetRowIJ */
    for (i=0;i<local_rows;i++) {
      const PetscInt *cols;
      PetscInt       ncols,index_row = global_indices_r[i];

      ierr = MatGetRow(matis->A,i,&ncols,&cols,NULL);CHKERRQ(ierr);
      for (j=0;j<ncols;j++) {
        PetscInt owner = row_ownership[index_row];
        PetscInt index_col = global_indices_c[cols[j]];
        if (index_col > mat_ranges[owner]-1 && index_col < mat_ranges[owner+1]) { /* diag block */
          my_dnz[i] += 1;
        } else { /* offdiag block */
          my_onz[i] += 1;
        }
        /* same as before, interchanging rows and cols */
        if (issbaij && index_col != index_row) {
          owner = row_ownership[index_col];
          if (index_row > mat_ranges[owner]-1 && index_row < mat_ranges[owner+1]) {
            my_dnz[cols[j]] += 1;
          } else {
            my_onz[cols[j]] += 1;
          }
        }
      }
      ierr = MatRestoreRow(matis->A,i,&ncols,&cols,NULL);CHKERRQ(ierr);
    }
  }
  ierr = ISLocalToGlobalMappingRestoreIndices(A->rmap->mapping,&global_indices_r);CHKERRQ(ierr);
  if (global_indices_c != global_indices_r) {
    ierr = ISLocalToGlobalMappingRestoreIndices(A->cmap->mapping,&global_indices_c);CHKERRQ(ierr);
  }
  ierr = PetscFree(row_ownership);CHKERRQ(ierr);

  /* Reduce my_dnz and my_onz */
  if (maxreduce) {
    ierr = PetscSFReduceBegin(matis->sf,MPIU_INT,my_dnz,dnz,MPI_MAX);CHKERRQ(ierr);
    ierr = PetscSFReduceEnd(matis->sf,MPIU_INT,my_dnz,dnz,MPI_MAX);CHKERRQ(ierr);
    ierr = PetscSFReduceBegin(matis->sf,MPIU_INT,my_onz,onz,MPI_MAX);CHKERRQ(ierr);
    ierr = PetscSFReduceEnd(matis->sf,MPIU_INT,my_onz,onz,MPI_MAX);CHKERRQ(ierr);
  } else {
    ierr = PetscSFReduceBegin(matis->sf,MPIU_INT,my_dnz,dnz,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscSFReduceEnd(matis->sf,MPIU_INT,my_dnz,dnz,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscSFReduceBegin(matis->sf,MPIU_INT,my_onz,onz,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscSFReduceEnd(matis->sf,MPIU_INT,my_onz,onz,MPI_SUM);CHKERRQ(ierr);
  }
  ierr = PetscFree2(my_dnz,my_onz);CHKERRQ(ierr);

  /* Resize preallocation if overestimated */
  for (i=0;i<lrows;i++) {
    dnz[i] = PetscMin(dnz[i],lcols);
    onz[i] = PetscMin(onz[i],cols-lcols);
  }
  /* set preallocation */
  ierr = MatMPIAIJSetPreallocation(B,0,dnz,0,onz);CHKERRQ(ierr);
  for (i=0;i<lrows/bs;i++) {
    dnz[i] = dnz[i*bs]/bs;
    onz[i] = onz[i*bs]/bs;
  }
  ierr = MatMPIBAIJSetPreallocation(B,bs,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatMPISBAIJSetPreallocation(B,bs,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
  if (issbaij) {
    ierr = MatRestoreRowUpperTriangular(matis->A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
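/*
  Example (not part of the original source; compile as a separate program): a minimal sketch of
  the PetscSF reduction pattern used above to accumulate the per-subdomain counts my_dnz/my_onz
  into the owner's dnz/onz arrays. The tiny star graph below (every process contributes two
  leaves to one root owned by rank 0) and the leaf values are made up purely for illustration.
*/
#include <petscsf.h>

int main(int argc,char **argv)
{
  PetscSF        sf;
  PetscSFNode    remote[2];
  PetscInt       leafdata[2],rootdata[1];
  PetscErrorCode ierr;

  ierr = PetscInitialize(&argc,&argv,NULL,NULL);if (ierr) return ierr;
  /* both leaves on every process point at root 0 owned by rank 0 */
  remote[0].rank = 0; remote[0].index = 0;
  remote[1].rank = 0; remote[1].index = 0;
  ierr = PetscSFCreate(PETSC_COMM_WORLD,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,1,2,NULL,PETSC_COPY_VALUES,remote,PETSC_COPY_VALUES);CHKERRQ(ierr);

  leafdata[0] = 3; leafdata[1] = 4; /* local contributions, e.g. nonzero counts of shared rows */
  rootdata[0] = 0;                  /* root data starts at zero, like the calloc'd dnz/onz     */
  ierr = PetscSFReduceBegin(sf,MPIU_INT,leafdata,rootdata,MPI_SUM);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sf,MPIU_INT,leafdata,rootdata,MPI_SUM);CHKERRQ(ierr);
  /* on rank 0, rootdata[0] now holds the sum of all leaf contributions; using MPI_MAX
     instead of MPI_SUM gives the maxreduce variant supported by the routine above */

  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFinalize();
  return 0;
}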
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode      ierr;
  Mat                 Cmpi;
  Mat_PtAPMPI         *ptap;
  PetscFreeSpaceList  free_space=NULL,current_space=NULL;
  Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
  Mat_SeqAIJ          *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ          *p_loc,*p_oth;
  PetscInt            *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pdti,*pdtj,*poti,*potj,*ptJ;
  PetscInt            *adi=ad->i,*aj,*aoi=ao->i,nnz;
  PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
  PetscInt            am=A->rmap->n,pN=P->cmap->N,pm=P->rmap->n,pn=P->cmap->n;
  PetscBT             lnkbt;
  MPI_Comm            comm;
  PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0;
  PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt            len,proc,*dnz,*onz,*owners;
  PetscInt            nzi,*pti,*ptj;
  PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request         *swaits,*rwaits;
  MPI_Status          *sstatus,rstatus;
  Mat_Merge_SeqsToMPI *merge;
  PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,pon,nspacedouble=0,j,ap_rmax=0;
  PetscReal           afill=1.0,afill_tmp;
  PetscInt            rmax;
#if defined(PTAP_PROFILE)
  PetscLogDouble t0,t1,t2,t3,t4;
#endif

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t0);CHKERRQ(ierr);
#endif

  /* check if matrix local sizes are compatible */
  if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
    SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
  }
  if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) {
    SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
  }

  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr = PetscNew(&ptap);CHKERRQ(ierr);
  ierr = PetscNew(&merge);CHKERRQ(ierr);

  ptap->merge = merge;
  ptap->reuse = MAT_INITIAL_MATRIX;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j;
  pi_oth = p_oth->i; pj_oth = p_oth->j;
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t1);CHKERRQ(ierr);
#endif

  /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
  /*-------------------------------------------------------------------*/
  ierr   = PetscMalloc1((am+1),&api);CHKERRQ(ierr);
  api[0] = 0;

  /* create and initialize a linked list */
  ierr = PetscLLCondensedCreate(pN,pN,&lnk,&lnkbt);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) -OOM for ex56, np=8k on Intrepid! */
  ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);
  current_space = free_space;

  for (i=0; i<am; i++) {
    /* diagonal portion of A */
    nzi = adi[i+1] - adi[i];
    aj  = ad->j + adi[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi_loc[row+1] - pi_loc[row];
      Jptr = pj_loc + pi_loc[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    /* off-diagonal portion of A */
    nzi = aoi[i+1] - aoi[i];
    aj  = ao->j + aoi[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi_oth[row+1] - pi_oth[row];
      Jptr = pj_oth + pi_oth[row];
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;
    if (ap_rmax < apnz) ap_rmax = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }

  /* Allocate space for apj, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr      = PetscMalloc1((api[am]+1),&apj);CHKERRQ(ierr);
  ierr      = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1);
  if (afill_tmp > afill) afill = afill_tmp;
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t2);CHKERRQ(ierr);
#endif

  /* determine symbolic Co=(p->B)^T*AP - send to others */
  /*----------------------------------------------------*/
  ierr = MatGetSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);CHKERRQ(ierr);

  /* then, compute symbolic Co = (p->B)^T*AP */
  pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors >= (num of nonzero rows of C_seq) - pn */
  ierr   = PetscMalloc1((pon+1),&coi);CHKERRQ(ierr);
  coi[0] = 0;

  /* set initial free space to be fill*(nnz(p->B) + nnz(AP)) */
  nnz           = fill*(poti[pon] + api[am]);
  ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  for (i=0; i<pon; i++) {
    pnz = poti[i+1] - poti[i];
    ptJ = potj + poti[i];
    for (j=0; j<pnz; j++) {
      row  = ptJ[j]; /* row of AP == col of Pot */
      apnz = api[row+1] - api[row];
      Jptr = apj + api[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    nnz = lnk[0];

    /* If free space is not available, double the total space in the list */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, and zero out denserows */
    ierr = PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);

    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;

    coi[i+1] = coi[i] + nnz;
  }

  ierr      = PetscMalloc1((coi[pon]+1),&coj);CHKERRQ(ierr);
  ierr      = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)coi[pon]/(poti[pon] + api[am]+1);
  if (afill_tmp > afill) afill = afill_tmp;
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);CHKERRQ(ierr);

  /* send j-array (coj) of Co to other processors */
  /*----------------------------------------------*/
  /* determine row ownership */
  ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
  merge->rowmap->n  = pn;
  merge->rowmap->bs = 1;

  ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
  owners = merge->rowmap->range;

  /* determine the number of messages to send, their lengths */
  ierr = PetscMalloc2(size,&len_si,size,&sstatus);CHKERRQ(ierr);
  ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscCalloc1(size,&merge->len_s);CHKERRQ(ierr);

  len_s        = merge->len_s;
  merge->nsend = 0;

  ierr = PetscMalloc1((size+2),&owners_co);CHKERRQ(ierr);

  proc = 0;
  for (i=0; i<pon; i++) {
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;               /* num of rows in Co to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i];
  }

  len          = 0; /* max length of buf_si[] */
  owners_co[0] = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_si[proc]) {
      merge->nsend++;
      len_si[proc] = 2*(len_si[proc] + 1);
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj */
  ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1((merge->nsend+1),&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
    k++;
  }

  /* receives and sends of coj are complete */
  for (i=0; i<merge->nrecv; i++) {
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}

  /* send and recv coi */
  /*-------------------*/
  ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr   = PetscMalloc1((len+1),&buf_s);CHKERRQ(ierr);
  buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1;
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
      nzi               = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;   /* i-structure */
      buf_si[nrows+1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }
  i = merge->nrecv;
  while (i--) {
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
  ierr = PetscFree2(len_si,sstatus);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t3);CHKERRQ(ierr);
#endif

  /* compute the local portion of C (mpi mat) */
  /*------------------------------------------*/
  ierr = MatGetSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);CHKERRQ(ierr);

  /* allocate pti array and free space for accumulating nonzero column info */
  ierr   = PetscMalloc1((pn+1),&pti);CHKERRQ(ierr);
  pti[0] = 0;

  /* set initial free space to be fill*(nnz(P) + nnz(AP)) */
  nnz           = fill*(pi_loc[pm] + api[am]);
  ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
  }
  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
  rmax = 0;
  for (i=0; i<pn; i++) {
    /* add pdt[i,:]*AP into lnk */
    pnz = pdti[i+1] - pdti[i];
    ptJ = pdtj + pdti[i];
    for (j=0; j<pnz; j++) {
      row  = ptJ[j]; /* row of AP == col of Pt */
      apnz = api[row+1] - api[row];
      Jptr = apj + api[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }

    /* add received col data into lnk */
    for (k=0; k<merge->nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nnz = lnk[0];

    /* if free space is not available, make more free space */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }
    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;

    pti[i+1] = pti[i] + nnz;
    if (nnz > rmax) rmax = nnz;
  }
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);CHKERRQ(ierr);
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);

  ierr      = PetscMalloc1((pti[pn]+1),&ptj);CHKERRQ(ierr);
  ierr      = PetscFreeSpaceContiguous(&free_space,ptj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)pti[pn]/(pi_loc[pm] + api[am]+1);
  if (afill_tmp > afill) afill = afill_tmp;
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* create symbolic parallel matrix Cmpi */
  /*--------------------------------------*/
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,P->cmap->bs,P->cmap->bs);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  merge->bi        = pti; /* Cseq->i */
  merge->bj        = ptj; /* Cseq->j */
  merge->coi       = coi; /* Co->i   */
  merge->coj       = coj; /* Co->j   */
  merge->buf_ri    = buf_ri;
  merge->buf_rj    = buf_rj;
  merge->owners_co = owners_co;
  merge->destroy   = Cmpi->ops->destroy;
  merge->duplicate = Cmpi->ops->duplicate;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled      = PETSC_FALSE;
  Cmpi->ops->destroy   = MatDestroy_MPIAIJ_PtAP;
  Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;

  /* attach the supporting struct to Cmpi for reuse */
  c       = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap = ptap;

  ptap->api  = api;
  ptap->apj  = apj;
  ptap->rmax = ap_rmax;

  *C = Cmpi;

  /* flag 'scalable' determines which implementation is used:
       0: do dense axpy in MatPtAPNumeric() - fast, but requires storage of a nonscalable dense array apa;
       1: do sparse axpy in MatPtAPNumeric() - might be slower, but uses a sparse array apa */
  /* set default scalable */
  ptap->scalable = PETSC_TRUE;

  ierr = PetscOptionsGetBool(((PetscObject)Cmpi)->prefix,"-matptap_scalable",&ptap->scalable,NULL);CHKERRQ(ierr);
  if (!ptap->scalable) { /* Do dense axpy */
    ierr = PetscCalloc1(pN,&ptap->apa);CHKERRQ(ierr);
  } else {
    ierr = PetscCalloc1(ap_rmax+1,&ptap->apa);CHKERRQ(ierr);
  }

#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t4);CHKERRQ(ierr);
  if (rank == 1) {
    ierr = PetscPrintf(MPI_COMM_SELF,"  [%d] PtAPSymbolic %g/P + %g/AP + %g/comm + %g/PtAP = %g\n",rank,t1-t0,t2-t1,t3-t2,t4-t3,t4-t0);CHKERRQ(ierr);
  }
#endif

#if defined(PETSC_USE_INFO)
  if (pti[pn] != 0) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatPtAP(A,P,MatReuse,%G,&C) for best performance.\n",afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}
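/*
  Example (not part of the original source; compile as a separate program): the symbolic routine
  above is normally reached through MatPtAP(), which runs the symbolic phase for
  MAT_INITIAL_MATRIX and the numeric phase afterwards. The small diagonal A and P built here are
  made up only to provide assembled AIJ operands; fill = 2.0 is just a guess.
*/
#include <petscmat.h>

int main(int argc,char **argv)
{
  Mat            A,P,C;
  PetscInt       i,rstart,rend,n = 8;
  PetscScalar    two = 2.0,one = 1.0;
  PetscErrorCode ierr;

  ierr = PetscInitialize(&argc,&argv,NULL,NULL);if (ierr) return ierr;
  ierr = MatCreateAIJ(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,n,n,1,NULL,0,NULL,&A);CHKERRQ(ierr);
  ierr = MatCreateAIJ(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,n,n,1,NULL,0,NULL,&P);CHKERRQ(ierr);
  ierr = MatGetOwnershipRange(A,&rstart,&rend);CHKERRQ(ierr);
  for (i=rstart; i<rend; i++) {
    ierr = MatSetValues(A,1,&i,1,&i,&two,INSERT_VALUES);CHKERRQ(ierr);
    ierr = MatSetValues(P,1,&i,1,&i,&one,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(P,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(P,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  /* MAT_INITIAL_MATRIX triggers the symbolic phase, followed by the numeric phase */
  ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr);
  /* with unchanged sparsity patterns the symbolic data attached to C is reused */
  ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);CHKERRQ(ierr);

  ierr = MatDestroy(&C);CHKERRQ(ierr);
  ierr = MatDestroy(&P);CHKERRQ(ierr);
  ierr = MatDestroy(&A);CHKERRQ(ierr);
  ierr = PetscFinalize();
  return 0;
}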
static PetscErrorCode DMCreateMatrix_Composite_AIJ(DM dm,MatType mtype,Mat *J)
{
  PetscErrorCode         ierr;
  DM_Composite           *com = (DM_Composite*)dm->data;
  struct DMCompositeLink *next;
  PetscInt               m,*dnz,*onz,i,j,mA;
  Mat                    Atmp;
  PetscMPIInt            rank;
  PetscBool              dense = PETSC_FALSE;

  PetscFunctionBegin;
  /* use global vector to determine layout needed for matrix */
  m = com->n;

  ierr = MatCreate(PetscObjectComm((PetscObject)dm),J);CHKERRQ(ierr);
  ierr = MatSetSizes(*J,m,m,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetType(*J,mtype);CHKERRQ(ierr);

  /* Extremely inefficient but will compute entire Jacobian for testing */
  ierr = PetscOptionsGetBool(((PetscObject)dm)->prefix,"-dmcomposite_dense_jacobian",&dense,NULL);CHKERRQ(ierr);
  if (dense) {
    PetscInt    rstart,rend,*indices;
    PetscScalar *values;

    mA   = com->N;
    ierr = MatMPIAIJSetPreallocation(*J,mA,NULL,mA-m,NULL);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*J,mA,NULL);CHKERRQ(ierr);

    ierr = MatGetOwnershipRange(*J,&rstart,&rend);CHKERRQ(ierr);
    ierr = PetscMalloc2(mA,&values,mA,&indices);CHKERRQ(ierr);
    ierr = PetscMemzero(values,mA*sizeof(PetscScalar));CHKERRQ(ierr);
    for (i=0; i<mA; i++) indices[i] = i;
    for (i=rstart; i<rend; i++) {
      ierr = MatSetValues(*J,1,&i,mA,indices,values,INSERT_VALUES);CHKERRQ(ierr);
    }
    ierr = PetscFree2(values,indices);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(*J,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(*J,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)dm),&rank);CHKERRQ(ierr);
  ierr = MatPreallocateInitialize(PetscObjectComm((PetscObject)dm),m,m,dnz,onz);CHKERRQ(ierr);
  /* loop over packed objects, handling one at a time */
  next = com->next;
  while (next) {
    PetscInt       nc,rstart,*ccols,maxnc;
    const PetscInt *cols,*rstarts;
    PetscMPIInt    proc;

    ierr = DMCreateMatrix(next->dm,mtype,&Atmp);CHKERRQ(ierr);
    ierr = MatGetOwnershipRange(Atmp,&rstart,NULL);CHKERRQ(ierr);
    ierr = MatGetOwnershipRanges(Atmp,&rstarts);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Atmp,&mA,NULL);CHKERRQ(ierr);

    maxnc = 0;
    for (i=0; i<mA; i++) {
      ierr  = MatGetRow(Atmp,rstart+i,&nc,NULL,NULL);CHKERRQ(ierr);
      ierr  = MatRestoreRow(Atmp,rstart+i,&nc,NULL,NULL);CHKERRQ(ierr);
      maxnc = PetscMax(nc,maxnc);
    }
    ierr = PetscMalloc(maxnc*sizeof(PetscInt),&ccols);CHKERRQ(ierr);
    for (i=0; i<mA; i++) {
      ierr = MatGetRow(Atmp,rstart+i,&nc,&cols,NULL);CHKERRQ(ierr);
      /* remap the columns taking into account how much they are shifted on each process */
      for (j=0; j<nc; j++) {
        proc = 0;
        while (cols[j] >= rstarts[proc+1]) proc++;
        ccols[j] = cols[j] + next->grstarts[proc] - rstarts[proc];
      }
      ierr = MatPreallocateSet(com->rstart+next->rstart+i,nc,ccols,dnz,onz);CHKERRQ(ierr);
      ierr = MatRestoreRow(Atmp,rstart+i,&nc,&cols,NULL);CHKERRQ(ierr);
    }
    ierr = PetscFree(ccols);CHKERRQ(ierr);
    ierr = MatDestroy(&Atmp);CHKERRQ(ierr);
    next = next->next;
  }
  if (com->FormCoupleLocations) {
    ierr = (*com->FormCoupleLocations)(dm,NULL,dnz,onz,__rstart,__nrows,__start,__end);CHKERRQ(ierr);
  }
  ierr = MatMPIAIJSetPreallocation(*J,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(*J,0,dnz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  if (dm->prealloc_only) PetscFunctionReturn(0);

  next = com->next;
  while (next) {
    PetscInt          nc,rstart,row,maxnc,*ccols;
    const PetscInt    *cols,*rstarts;
    const PetscScalar *values;
    PetscMPIInt       proc;

    ierr = DMCreateMatrix(next->dm,mtype,&Atmp);CHKERRQ(ierr);
    ierr = MatGetOwnershipRange(Atmp,&rstart,NULL);CHKERRQ(ierr);
    ierr = MatGetOwnershipRanges(Atmp,&rstarts);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Atmp,&mA,NULL);CHKERRQ(ierr);

    maxnc = 0;
    for (i=0; i<mA; i++) {
      ierr  = MatGetRow(Atmp,rstart+i,&nc,NULL,NULL);CHKERRQ(ierr);
      ierr  = MatRestoreRow(Atmp,rstart+i,&nc,NULL,NULL);CHKERRQ(ierr);
      maxnc = PetscMax(nc,maxnc);
    }
    ierr = PetscMalloc(maxnc*sizeof(PetscInt),&ccols);CHKERRQ(ierr);
    for (i=0; i<mA; i++) {
      ierr = MatGetRow(Atmp,rstart+i,&nc,(const PetscInt**)&cols,&values);CHKERRQ(ierr);
      for (j=0; j<nc; j++) {
        proc = 0;
        while (cols[j] >= rstarts[proc+1]) proc++;
        ccols[j] = cols[j] + next->grstarts[proc] - rstarts[proc];
      }
      row  = com->rstart + next->rstart + i;
      ierr = MatSetValues(*J,1,&row,nc,ccols,values,INSERT_VALUES);CHKERRQ(ierr);
      ierr = MatRestoreRow(Atmp,rstart+i,&nc,(const PetscInt**)&cols,&values);CHKERRQ(ierr);
    }
    ierr = PetscFree(ccols);CHKERRQ(ierr);
    ierr = MatDestroy(&Atmp);CHKERRQ(ierr);
    next = next->next;
  }
  if (com->FormCoupleLocations) {
    PetscInt __rstart;
    ierr = MatGetOwnershipRange(*J,&__rstart,NULL);CHKERRQ(ierr);
    ierr = (*com->FormCoupleLocations)(dm,*J,NULL,NULL,__rstart,0,0,0);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(*J,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(*J,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
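/*
  Example (not part of the original source; compile as a separate program): the
  MatPreallocateInitialize / MatPreallocateSet / MatPreallocateFinalize macro pattern that
  DMCreateMatrix_Composite_AIJ relies on. The row layout and stencil below are made up:
  every process owns m contiguous rows and each row has a diagonal entry plus one neighbour.
*/
#include <petscmat.h>

int main(int argc,char **argv)
{
  Mat            J;
  PetscInt       *dnz,*onz,i,row,cols[2],m = 4,rstart,N;
  PetscScalar    v[2] = {2.0,-1.0};
  PetscMPIInt    rank,size;
  PetscErrorCode ierr;

  ierr = PetscInitialize(&argc,&argv,NULL,NULL);if (ierr) return ierr;
  ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
  rstart = rank*m; /* contiguous ownership, m rows per process */
  N      = size*m;

  ierr = MatCreate(PETSC_COMM_WORLD,&J);CHKERRQ(ierr);
  ierr = MatSetSizes(J,m,m,N,N);CHKERRQ(ierr);
  ierr = MatSetType(J,MATAIJ);CHKERRQ(ierr);

  /* first pass: only count the columns of every local row into dnz/onz */
  ierr = MatPreallocateInitialize(PETSC_COMM_WORLD,m,m,dnz,onz);CHKERRQ(ierr);
  for (i=0; i<m; i++) {
    row     = rstart + i;
    cols[0] = row;           /* diagonal entry */
    cols[1] = (row + 1) % N; /* one (possibly off-process) neighbour */
    ierr    = MatPreallocateSet(row,2,cols,dnz,onz);CHKERRQ(ierr);
  }
  ierr = MatMPIAIJSetPreallocation(J,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(J,0,dnz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* second pass: the actual insertions now fit the preallocated pattern exactly */
  for (i=0; i<m; i++) {
    row     = rstart + i;
    cols[0] = row;
    cols[1] = (row + 1) % N;
    ierr    = MatSetValues(J,1,&row,2,cols,v,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(J,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(J,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatDestroy(&J);CHKERRQ(ierr);
  ierr = PetscFinalize();
  return 0;
}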
PetscErrorCode MatISGetMPIXAIJ_IS(Mat mat, MatReuse reuse, Mat *M)
{
  Mat_IS         *matis = (Mat_IS*)(mat->data);
  /* info on mat */
  /* ISLocalToGlobalMapping rmapping,cmapping; */
  PetscInt       bs,rows,cols;
  PetscInt       lrows,lcols;
  PetscInt       local_rows,local_cols;
  PetscBool      isdense,issbaij,issbaij_red;
  /* values insertion */
  PetscScalar    *array;
  PetscInt       *local_indices,*global_indices;
  /* work */
  PetscInt       i,j,index_row;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* MISSING CHECKS
     - rectangular case not covered (it is not allowed by MATIS) */
  /* get info from mat */
  /* ierr = MatGetLocalToGlobalMapping(mat,&rmapping,&cmapping);CHKERRQ(ierr); */
  ierr = MatGetSize(mat,&rows,&cols);CHKERRQ(ierr);
  ierr = MatGetBlockSize(mat,&bs);CHKERRQ(ierr);
  ierr = MatGetSize(matis->A,&local_rows,&local_cols);CHKERRQ(ierr);
  ierr = PetscObjectTypeCompare((PetscObject)matis->A,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  ierr = PetscObjectTypeCompare((PetscObject)matis->A,MATSEQSBAIJ,&issbaij);CHKERRQ(ierr);

  /* work */
  ierr = PetscMalloc1(local_rows,&local_indices);CHKERRQ(ierr);
  for (i=0;i<local_rows;i++) local_indices[i]=i;

  /* map indices of local mat to global values */
  ierr = PetscMalloc(PetscMax(local_cols,local_rows)*sizeof(*global_indices),&global_indices);CHKERRQ(ierr);
  /* ierr = ISLocalToGlobalMappingApply(rmapping,local_rows,local_indices,global_indices);CHKERRQ(ierr); */
  ierr = ISLocalToGlobalMappingApply(matis->mapping,local_rows,local_indices,global_indices);CHKERRQ(ierr);

  if (issbaij) {
    ierr = MatGetRowUpperTriangular(matis->A);CHKERRQ(ierr);
  }

  if (reuse == MAT_INITIAL_MATRIX) {
    Mat         new_mat;
    MatType     new_mat_type;
    Vec         vec_dnz,vec_onz;
    PetscScalar *my_dnz,*my_onz;
    PetscInt    *dnz,*onz,*mat_ranges,*row_ownership;
    PetscInt    index_col,owner;
    PetscMPIInt nsubdomains;

    /* determining new matrix type */
    ierr = MPI_Allreduce(&issbaij,&issbaij_red,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)mat));CHKERRQ(ierr);
    if (issbaij_red) {
      new_mat_type = MATSBAIJ;
    } else {
      if (bs>1) {
        new_mat_type = MATBAIJ;
      } else {
        new_mat_type = MATAIJ;
      }
    }

    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)mat),&nsubdomains);CHKERRQ(ierr);
    ierr = MatCreate(PetscObjectComm((PetscObject)mat),&new_mat);CHKERRQ(ierr);
    ierr = MatSetSizes(new_mat,PETSC_DECIDE,PETSC_DECIDE,rows,cols);CHKERRQ(ierr);
    ierr = MatSetBlockSize(new_mat,bs);CHKERRQ(ierr);
    ierr = MatSetType(new_mat,new_mat_type);CHKERRQ(ierr);
    ierr = MatSetUp(new_mat);CHKERRQ(ierr);
    ierr = MatGetLocalSize(new_mat,&lrows,&lcols);CHKERRQ(ierr);

    /* preallocation */
    ierr = MatPreallocateInitialize(PetscObjectComm((PetscObject)new_mat),lrows,lcols,dnz,onz);CHKERRQ(ierr);
    /*
       Some vectors are needed to sum up properly on shared interface dofs.
       Preallocation macros cannot do the job.
       Note that preallocation is not exact, since it overestimates nonzeros
    */
    ierr = MatCreateVecs(new_mat,NULL,&vec_dnz);CHKERRQ(ierr);
    /* ierr = VecSetLocalToGlobalMapping(vec_dnz,rmapping);CHKERRQ(ierr); */
    ierr = VecSetLocalToGlobalMapping(vec_dnz,matis->mapping);CHKERRQ(ierr);
    ierr = VecDuplicate(vec_dnz,&vec_onz);CHKERRQ(ierr);
    /* All processes need to compute entire row ownership */
    ierr = PetscMalloc1(rows,&row_ownership);CHKERRQ(ierr);
    ierr = MatGetOwnershipRanges(new_mat,(const PetscInt**)&mat_ranges);CHKERRQ(ierr);
    for (i=0;i<nsubdomains;i++) {
      for (j=mat_ranges[i];j<mat_ranges[i+1];j++) {
        row_ownership[j]=i;
      }
    }

    /* my_dnz and my_onz contain the exact contribution to preallocation from each local mat;
       they will then be summed up properly. This way, preallocation is always sufficient */
    ierr = PetscMalloc1(local_rows,&my_dnz);CHKERRQ(ierr);
    ierr = PetscMalloc1(local_rows,&my_onz);CHKERRQ(ierr);
    ierr = PetscMemzero(my_dnz,local_rows*sizeof(*my_dnz));CHKERRQ(ierr);
    ierr = PetscMemzero(my_onz,local_rows*sizeof(*my_onz));CHKERRQ(ierr);
    /* preallocation as a MATAIJ */
    if (isdense) { /* special case for dense local matrices */
      for (i=0;i<local_rows;i++) {
        index_row = global_indices[i];
        for (j=i;j<local_rows;j++) {
          owner = row_ownership[index_row];
          index_col = global_indices[j];
          if (index_col > mat_ranges[owner]-1 && index_col < mat_ranges[owner+1]) { /* diag block */
            my_dnz[i] += 1.0;
          } else { /* offdiag block */
            my_onz[i] += 1.0;
          }
          /* same as before, interchanging rows and cols */
          if (i != j) {
            owner = row_ownership[index_col];
            if (index_row > mat_ranges[owner]-1 && index_row < mat_ranges[owner+1]) {
              my_dnz[j] += 1.0;
            } else {
              my_onz[j] += 1.0;
            }
          }
        }
      }
    } else {
      for (i=0;i<local_rows;i++) {
        PetscInt       ncols;
        const PetscInt *cols;

        index_row = global_indices[i];
        ierr = MatGetRow(matis->A,i,&ncols,&cols,NULL);CHKERRQ(ierr);
        for (j=0;j<ncols;j++) {
          owner = row_ownership[index_row];
          index_col = global_indices[cols[j]];
          if (index_col > mat_ranges[owner]-1 && index_col < mat_ranges[owner+1]) { /* diag block */
            my_dnz[i] += 1.0;
          } else { /* offdiag block */
            my_onz[i] += 1.0;
          }
          /* same as before, interchanging rows and cols: the transposed entry is counted
             for the local row of the column, i.e. cols[j], not for the position j */
          if (issbaij) {
            owner = row_ownership[index_col];
            if (index_row > mat_ranges[owner]-1 && index_row < mat_ranges[owner+1]) {
              my_dnz[cols[j]] += 1.0;
            } else {
              my_onz[cols[j]] += 1.0;
            }
          }
        }
        ierr = MatRestoreRow(matis->A,i,&ncols,&cols,NULL);CHKERRQ(ierr);
      }
    }
    ierr = VecSet(vec_dnz,0.0);CHKERRQ(ierr);
    ierr = VecSet(vec_onz,0.0);CHKERRQ(ierr);
    if (local_rows) { /* multilevel guard */
      ierr = VecSetValuesLocal(vec_dnz,local_rows,local_indices,my_dnz,ADD_VALUES);CHKERRQ(ierr);
      ierr = VecSetValuesLocal(vec_onz,local_rows,local_indices,my_onz,ADD_VALUES);CHKERRQ(ierr);
    }
    ierr = VecAssemblyBegin(vec_dnz);CHKERRQ(ierr);
    ierr = VecAssemblyBegin(vec_onz);CHKERRQ(ierr);
    ierr = VecAssemblyEnd(vec_dnz);CHKERRQ(ierr);
    ierr = VecAssemblyEnd(vec_onz);CHKERRQ(ierr);
    ierr = PetscFree(my_dnz);CHKERRQ(ierr);
    ierr = PetscFree(my_onz);CHKERRQ(ierr);
    ierr = PetscFree(row_ownership);CHKERRQ(ierr);

    /* set computed preallocation in dnz and onz */
    ierr = VecGetArray(vec_dnz,&array);CHKERRQ(ierr);
    for (i=0;i<lrows;i++) dnz[i] = (PetscInt)PetscRealPart(array[i]);
    ierr = VecRestoreArray(vec_dnz,&array);CHKERRQ(ierr);
    ierr = VecGetArray(vec_onz,&array);CHKERRQ(ierr);
    for (i=0;i<lrows;i++) onz[i] = (PetscInt)PetscRealPart(array[i]);
    ierr = VecRestoreArray(vec_onz,&array);CHKERRQ(ierr);
    ierr = VecDestroy(&vec_dnz);CHKERRQ(ierr);
    ierr = VecDestroy(&vec_onz);CHKERRQ(ierr);

    /* Resize preallocation if overestimated */
    for (i=0;i<lrows;i++) {
      dnz[i] = PetscMin(dnz[i],lcols);
      onz[i] = PetscMin(onz[i],cols-lcols);
    }
    /* set preallocation */
    ierr = MatMPIAIJSetPreallocation(new_mat,0,dnz,0,onz);CHKERRQ(ierr);
    for (i=0;i<lrows/bs;i++) {
      dnz[i] = dnz[i*bs]/bs;
      onz[i] = onz[i*bs]/bs;
    }
    ierr = MatMPIBAIJSetPreallocation(new_mat,bs,0,dnz,0,onz);CHKERRQ(ierr);
    for (i=0;i<lrows/bs;i++) {
      dnz[i] = dnz[i]-i;
    }
    ierr = MatMPISBAIJSetPreallocation(new_mat,bs,0,dnz,0,onz);CHKERRQ(ierr);
    ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
    *M = new_mat;
  } else {
    PetscInt mbs,mrows,mcols;

    /* some checks */
    ierr = MatGetBlockSize(*M,&mbs);CHKERRQ(ierr);
    ierr = MatGetSize(*M,&mrows,&mcols);CHKERRQ(ierr);
    if (mrows != rows) {
      SETERRQ2(PetscObjectComm((PetscObject)mat),PETSC_ERR_SUP,"Cannot reuse matrix. Wrong number of rows (%D != %D)",rows,mrows);
    }
    if (mcols != cols) {
      SETERRQ2(PetscObjectComm((PetscObject)mat),PETSC_ERR_SUP,"Cannot reuse matrix. Wrong number of cols (%D != %D)",cols,mcols);
    }
    if (mbs != bs) {
      SETERRQ2(PetscObjectComm((PetscObject)mat),PETSC_ERR_SUP,"Cannot reuse matrix. Wrong block size (%D != %D)",bs,mbs);
    }
    ierr = MatZeroEntries(*M);CHKERRQ(ierr);
  }
  /* set local to global mappings */
  /* ierr = MatSetLocalToGlobalMapping(*M,rmapping,cmapping);CHKERRQ(ierr); */
  /* Set values */
  if (isdense) { /* special case for dense local matrices */
    ierr = MatSetOption(*M,MAT_ROW_ORIENTED,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatDenseGetArray(matis->A,&array);CHKERRQ(ierr);
    ierr = MatSetValues(*M,local_rows,global_indices,local_cols,global_indices,array,ADD_VALUES);CHKERRQ(ierr);
    ierr = MatDenseRestoreArray(matis->A,&array);CHKERRQ(ierr);
    ierr = PetscFree(local_indices);CHKERRQ(ierr);
    ierr = PetscFree(global_indices);CHKERRQ(ierr);
  } else { /* very basic values insertion for all other matrix types */
    ierr = PetscFree(local_indices);CHKERRQ(ierr);
    for (i=0;i<local_rows;i++) {
      ierr = MatGetRow(matis->A,i,&j,(const PetscInt**)&local_indices,(const PetscScalar**)&array);CHKERRQ(ierr);
      /* ierr = MatSetValuesLocal(*M,1,&i,j,local_indices,array,ADD_VALUES);CHKERRQ(ierr); */
      ierr = ISLocalToGlobalMappingApply(matis->mapping,j,local_indices,global_indices);CHKERRQ(ierr);
      ierr = ISLocalToGlobalMappingApply(matis->mapping,1,&i,&index_row);CHKERRQ(ierr);
      ierr = MatSetValues(*M,1,&index_row,j,global_indices,array,ADD_VALUES);CHKERRQ(ierr);
      ierr = MatRestoreRow(matis->A,i,&j,(const PetscInt**)&local_indices,(const PetscScalar**)&array);CHKERRQ(ierr);
    }
    ierr = PetscFree(global_indices);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(*M,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(*M,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  if (isdense) {
    ierr = MatSetOption(*M,MAT_ROW_ORIENTED,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (issbaij) {
    ierr = MatRestoreRowUpperTriangular(matis->A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
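/*
  Example (not part of the original source; compile as a separate program): a typical caller of
  MatISGetMPIXAIJ_IS(). A MATIS matrix is assembled from overlapping per-process contributions
  and then converted to a globally assembled format through MatISGetMPIXAIJ(). The chain-of-dofs
  decomposition, the values, and the single-mapping MatCreateIS() signature are assumptions tied
  to the PETSc version these routines were taken from.
*/
#include <petscmat.h>

int main(int argc,char **argv)
{
  Mat                    A,B;
  ISLocalToGlobalMapping map;
  PetscInt               idx[2],lrows[2] = {0,1};
  PetscMPIInt            rank,size;
  PetscScalar            v[4] = {1.0,-1.0,-1.0,1.0}; /* 2x2 local "element" matrix */
  PetscErrorCode         ierr;

  ierr = PetscInitialize(&argc,&argv,NULL,NULL);if (ierr) return ierr;
  ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);

  /* process k sees global dofs {k,k+1} of a 1D chain with size+1 dofs (interface dofs are shared) */
  idx[0] = rank; idx[1] = rank+1;
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_WORLD,1,2,idx,PETSC_COPY_VALUES,&map);CHKERRQ(ierr);
  ierr = MatCreateIS(PETSC_COMM_WORLD,1,PETSC_DECIDE,PETSC_DECIDE,size+1,size+1,map,&A);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingDestroy(&map);CHKERRQ(ierr);

  /* each process adds its local 2x2 contribution; shared dofs are summed during conversion */
  ierr = MatSetValuesLocal(A,2,lrows,2,lrows,v,ADD_VALUES);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ierr = MatISGetMPIXAIJ(A,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr); /* dispatches to MatISGetMPIXAIJ_IS */
  ierr = MatView(B,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);

  ierr = MatDestroy(&B);CHKERRQ(ierr);
  ierr = MatDestroy(&A);CHKERRQ(ierr);
  ierr = PetscFinalize();
  return 0;
}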