Example #1
PETSC_EXTERN PetscErrorCode MatISSetMPIXAIJPreallocation_Private(Mat A, Mat B, PetscBool maxreduce)
{
  Mat_IS          *matis = (Mat_IS*)(A->data);
  PetscInt        *my_dnz,*my_onz,*dnz,*onz,*mat_ranges,*row_ownership;
  const PetscInt  *global_indices_r,*global_indices_c;
  PetscInt        i,j,bs,rows,cols;
  PetscInt        lrows,lcols;
  PetscInt        local_rows,local_cols;
  PetscMPIInt     nsubdomains;
  PetscBool       isdense,issbaij;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&nsubdomains);CHKERRQ(ierr);
  ierr = MatGetSize(A,&rows,&cols);CHKERRQ(ierr);
  ierr = MatGetBlockSize(A,&bs);CHKERRQ(ierr);
  ierr = MatGetSize(matis->A,&local_rows,&local_cols);CHKERRQ(ierr);
  ierr = PetscObjectTypeCompare((PetscObject)matis->A,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  ierr = PetscObjectTypeCompare((PetscObject)matis->A,MATSEQSBAIJ,&issbaij);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingGetIndices(A->rmap->mapping,&global_indices_r);CHKERRQ(ierr);
  if (A->rmap->mapping != A->cmap->mapping) {
    ierr = ISLocalToGlobalMappingGetIndices(A->cmap->mapping,&global_indices_c);CHKERRQ(ierr);
  } else {
    global_indices_c = global_indices_r;
  }

  if (issbaij) {
    ierr = MatGetRowUpperTriangular(matis->A);CHKERRQ(ierr);
  }
  /*
     An SF reduce is needed to sum up properly on shared rows.
     Note that generally preallocation is not exact, since it overestimates nonzeros
  */
  if (!matis->sf) { /* setup SF if not yet created and allocate rootdata and leafdata */
    ierr = MatISComputeSF_Private(A);CHKERRQ(ierr);
  }
  ierr = MatGetLocalSize(A,&lrows,&lcols);CHKERRQ(ierr);
  ierr = MatPreallocateInitialize(PetscObjectComm((PetscObject)A),lrows,lcols,dnz,onz);CHKERRQ(ierr);
  /* All processes need to compute entire row ownership */
  ierr = PetscMalloc1(rows,&row_ownership);CHKERRQ(ierr);
  ierr = MatGetOwnershipRanges(A,(const PetscInt**)&mat_ranges);CHKERRQ(ierr);
  for (i=0;i<nsubdomains;i++) {
    for (j=mat_ranges[i];j<mat_ranges[i+1];j++) {
      row_ownership[j] = i;
    }
  }

  /*
     my_dnz and my_onz contain the exact contribution to the preallocation from each local mat;
     they are then summed up properly. This way, the preallocation is always sufficient
  */
  ierr = PetscCalloc2(local_rows,&my_dnz,local_rows,&my_onz);CHKERRQ(ierr);
  /* preallocation as a MATAIJ */
  if (isdense) { /* special case for dense local matrices */
    for (i=0;i<local_rows;i++) {
      PetscInt index_row = global_indices_r[i];
      for (j=i;j<local_rows;j++) {
        PetscInt owner = row_ownership[index_row];
        PetscInt index_col = global_indices_c[j];
        if (index_col > mat_ranges[owner]-1 && index_col < mat_ranges[owner+1] ) { /* diag block */
          my_dnz[i] += 1;
        } else { /* offdiag block */
          my_onz[i] += 1;
        }
        /* same as before, interchanging rows and cols */
        if (i != j) {
          owner = row_ownership[index_col];
          if (index_row > mat_ranges[owner]-1 && index_row < mat_ranges[owner+1] ) {
            my_dnz[j] += 1;
          } else {
            my_onz[j] += 1;
          }
        }
      }
    }
  } else { /* TODO: this could be optimized using MatGetRowIJ */
    for (i=0;i<local_rows;i++) {
      const PetscInt *cols;
      PetscInt       ncols,index_row = global_indices_r[i];
      ierr = MatGetRow(matis->A,i,&ncols,&cols,NULL);CHKERRQ(ierr);
      for (j=0;j<ncols;j++) {
        PetscInt owner = row_ownership[index_row];
        PetscInt index_col = global_indices_c[cols[j]];
        if (index_col > mat_ranges[owner]-1 && index_col < mat_ranges[owner+1] ) { /* diag block */
          my_dnz[i] += 1;
        } else { /* offdiag block */
          my_onz[i] += 1;
        }
        /* same as before, interchanging rows and cols */
        if (issbaij && index_col != index_row) {
          owner = row_ownership[index_col];
          if (index_row > mat_ranges[owner]-1 && index_row < mat_ranges[owner+1] ) {
            my_dnz[cols[j]] += 1;
          } else {
            my_onz[cols[j]] += 1;
          }
        }
      }
      ierr = MatRestoreRow(matis->A,i,&ncols,&cols,NULL);CHKERRQ(ierr);
    }
  }
  ierr = ISLocalToGlobalMappingRestoreIndices(A->rmap->mapping,&global_indices_r);CHKERRQ(ierr);
  if (global_indices_c != global_indices_r) {
    ierr = ISLocalToGlobalMappingRestoreIndices(A->cmap->mapping,&global_indices_c);CHKERRQ(ierr);
  }
  ierr = PetscFree(row_ownership);CHKERRQ(ierr);

  /* Reduce my_dnz and my_onz */
  if (maxreduce) {
    ierr = PetscSFReduceBegin(matis->sf,MPIU_INT,my_dnz,dnz,MPI_MAX);CHKERRQ(ierr);
    ierr = PetscSFReduceEnd(matis->sf,MPIU_INT,my_dnz,dnz,MPI_MAX);CHKERRQ(ierr);
    ierr = PetscSFReduceBegin(matis->sf,MPIU_INT,my_onz,onz,MPI_MAX);CHKERRQ(ierr);
    ierr = PetscSFReduceEnd(matis->sf,MPIU_INT,my_onz,onz,MPI_MAX);CHKERRQ(ierr);
  } else {
    ierr = PetscSFReduceBegin(matis->sf,MPIU_INT,my_dnz,dnz,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscSFReduceEnd(matis->sf,MPIU_INT,my_dnz,dnz,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscSFReduceBegin(matis->sf,MPIU_INT,my_onz,onz,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscSFReduceEnd(matis->sf,MPIU_INT,my_onz,onz,MPI_SUM);CHKERRQ(ierr);
  }
  ierr = PetscFree2(my_dnz,my_onz);CHKERRQ(ierr);

  /* Resize preallocation if overestimated */
  for (i=0;i<lrows;i++) {
    dnz[i] = PetscMin(dnz[i],lcols);
    onz[i] = PetscMin(onz[i],cols-lcols);
  }
  /* set preallocation */
  ierr = MatMPIAIJSetPreallocation(B,0,dnz,0,onz);CHKERRQ(ierr);
  for (i=0;i<lrows/bs;i++) {
    dnz[i] = dnz[i*bs]/bs;
    onz[i] = onz[i*bs]/bs;
  }
  ierr = MatMPIBAIJSetPreallocation(B,bs,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatMPISBAIJSetPreallocation(B,bs,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
  if (issbaij) {
    ierr = MatRestoreRowUpperTriangular(matis->A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
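The routine above builds on PETSc's MatPreallocateInitialize/MatPreallocateSet/MatPreallocateFinalize macros, which allocate, fill, and free the dnz/onz arrays in the scope of the calling function. Below is a minimal, hedged sketch of that pattern in isolation; PreallocationPatternSketch and its one-entry-per-row column pattern are illustrative placeholders, not part of the code above.

/* Sketch only: assumes B already has its type and local size m x n set,
   and that rstart is the first global row owned by this process. */
static PetscErrorCode PreallocationPatternSketch(MPI_Comm comm,Mat B,PetscInt m,PetscInt n,PetscInt rstart)
{
  PetscInt       *dnz,*onz,i,cols[1];
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatPreallocateInitialize(comm,m,n,dnz,onz);CHKERRQ(ierr); /* allocates and zeroes dnz/onz */
  for (i=0;i<m;i++) {
    cols[0] = rstart+i; /* placeholder pattern: only the diagonal entry of each owned row */
    ierr = MatPreallocateSet(rstart+i,1,cols,dnz,onz);CHKERRQ(ierr); /* splits the count into diag/offdiag blocks */
  }
  ierr = MatMPIAIJSetPreallocation(B,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr); /* frees dnz/onz */
  PetscFunctionReturn(0);
}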
Example #2
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode      ierr;
  Mat                 Cmpi;
  Mat_PtAPMPI         *ptap;
  PetscFreeSpaceList  free_space=NULL,current_space=NULL;
  Mat_MPIAIJ          *a        =(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
  Mat_SeqAIJ          *ad       =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ          *p_loc,*p_oth;
  PetscInt            *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pdti,*pdtj,*poti,*potj,*ptJ;
  PetscInt            *adi=ad->i,*aj,*aoi=ao->i,nnz;
  PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
  PetscInt            am=A->rmap->n,pN=P->cmap->N,pm=P->rmap->n,pn=P->cmap->n;
  PetscBT             lnkbt;
  MPI_Comm            comm;
  PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0;
  PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt            len,proc,*dnz,*onz,*owners;
  PetscInt            nzi,*pti,*ptj;
  PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request         *swaits,*rwaits;
  MPI_Status          *sstatus,rstatus;
  Mat_Merge_SeqsToMPI *merge;
  PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,pon,nspacedouble=0,j,ap_rmax=0;
  PetscReal           afill=1.0,afill_tmp;
  PetscInt            rmax;
#if defined(PTAP_PROFILE)
  PetscLogDouble t0,t1,t2,t3,t4;
#endif

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t0);CHKERRQ(ierr);
#endif

  /* check if matrix local sizes are compatible */
  if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
    SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
  }
  if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) {
    SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
  }

  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr        = PetscNew(&ptap);CHKERRQ(ierr);
  ierr        = PetscNew(&merge);CHKERRQ(ierr);
  ptap->merge = merge;
  ptap->reuse = MAT_INITIAL_MATRIX;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);

  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j;
  pi_oth = p_oth->i; pj_oth = p_oth->j;
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t1);CHKERRQ(ierr);
#endif

  /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
  /*-------------------------------------------------------------------*/
  ierr   = PetscMalloc1((am+1),&api);CHKERRQ(ierr);
  api[0] = 0;

  /* create and initialize a linked list */
  ierr = PetscLLCondensedCreate(pN,pN,&lnk,&lnkbt);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) -OOM for ex56, np=8k on Intrepid! */
  ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);

  current_space = free_space;

  for (i=0; i<am; i++) {
    /* diagonal portion of A */
    nzi = adi[i+1] - adi[i];
    aj  = ad->j + adi[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi_loc[row+1] - pi_loc[row];
      Jptr = pj_loc + pi_loc[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    /* off-diagonal portion of A */
    nzi = aoi[i+1] - aoi[i];
    aj  = ao->j + aoi[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi_oth[row+1] - pi_oth[row];
      Jptr = pj_oth + pi_oth[row];
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;
    if (ap_rmax < apnz) ap_rmax = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }

  /* Allocate space for apj, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr      = PetscMalloc1((api[am]+1),&apj);CHKERRQ(ierr);
  ierr      = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1);
  if (afill_tmp > afill) afill = afill_tmp;

#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t2);CHKERRQ(ierr);
#endif

  /* determine symbolic Co=(p->B)^T*AP - send to others */
  /*----------------------------------------------------*/
  ierr = MatGetSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);CHKERRQ(ierr);

  /* then, compute symbolic Co = (p->B)^T*AP */
  pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors
                               >= (num of nonzero rows of C_seq) - pn */
  ierr   = PetscMalloc1((pon+1),&coi);CHKERRQ(ierr);
  coi[0] = 0;

  /* set initial free space to be fill*(nnz(p->B) + nnz(AP)) */
  nnz           = fill*(poti[pon] + api[am]);
  ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  for (i=0; i<pon; i++) {
    pnz = poti[i+1] - poti[i];
    ptJ = potj + poti[i];
    for (j=0; j<pnz; j++) {
      row  = ptJ[j]; /* row of AP == col of Pot */
      apnz = api[row+1] - api[row];
      Jptr = apj + api[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    nnz = lnk[0];

    /* If free space is not available, double the total space in the list */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);

    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;

    coi[i+1] = coi[i] + nnz;
  }
  ierr      = PetscMalloc1((coi[pon]+1),&coj);CHKERRQ(ierr);
  ierr      = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)coi[pon]/(poti[pon] + api[am]+1);
  if (afill_tmp > afill) afill = afill_tmp;
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);CHKERRQ(ierr);

  /* send j-array (coj) of Co to other processors */
  /*----------------------------------------------*/
  /* determine row ownership */
  ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
  merge->rowmap->n  = pn;
  merge->rowmap->bs = 1;

  ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
  owners = merge->rowmap->range;

  /* determine the number of messages to send, their lengths */
  ierr = PetscMalloc2(size,&len_si,size,&sstatus);CHKERRQ(ierr);
  ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscCalloc1(size,&merge->len_s);CHKERRQ(ierr);

  len_s        = merge->len_s;
  merge->nsend = 0;

  ierr = PetscMalloc1((size+2),&owners_co);CHKERRQ(ierr);

  proc = 0;
  for (i=0; i<pon; i++) {
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i];
  }

  len          = 0; /* max length of buf_si[] */
  owners_co[0] = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_si[proc]) {
      merge->nsend++;
      len_si[proc] = 2*(len_si[proc] + 1);
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj  */
  ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1((merge->nsend+1),&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
    k++;
  }

  /* receives and sends of coj are complete */
  for (i=0; i<merge->nrecv; i++) {
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}

  /* send and recv coi */
  /*-------------------*/
  ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr   = PetscMalloc1((len+1),&buf_s);CHKERRQ(ierr);
  buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1;
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
      nzi = coi[i+1] - coi[i];

      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
      buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }
  i = merge->nrecv;
  while (i--) {
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}

  ierr = PetscFree2(len_si,sstatus);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);

#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t3);CHKERRQ(ierr);
#endif

  /* compute the local portion of C (mpi mat) */
  /*------------------------------------------*/
  ierr = MatGetSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);CHKERRQ(ierr);

  /* allocate pti array and free space for accumulating nonzero column info */
  ierr   = PetscMalloc1((pn+1),&pti);CHKERRQ(ierr);
  pti[0] = 0;

  /* set initial free space to be fill*(nnz(P) + nnz(AP)) */
  nnz           = fill*(pi_loc[pm] + api[am]);
  ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
  }
  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
  rmax = 0;
  for (i=0; i<pn; i++) {
    /* add pdt[i,:]*AP into lnk */
    pnz = pdti[i+1] - pdti[i];
    ptJ = pdtj + pdti[i];
    for (j=0; j<pnz; j++) {
      row  = ptJ[j];  /* row of AP == col of Pt */
      apnz = api[row+1] - api[row];
      Jptr = apj + api[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }

    /* add received col data into lnk */
    for (k=0; k<merge->nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nnz = lnk[0];

    /* if free space is not available, make more free space */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }
    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;

    pti[i+1] = pti[i] + nnz;
    if (nnz > rmax) rmax = nnz;
  }
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);CHKERRQ(ierr);
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);

  ierr      = PetscMalloc1((pti[pn]+1),&ptj);CHKERRQ(ierr);
  ierr      = PetscFreeSpaceContiguous(&free_space,ptj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)pti[pn]/(pi_loc[pm] + api[am]+1);
  if (afill_tmp > afill) afill = afill_tmp;
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* create symbolic parallel matrix Cmpi */
  /*--------------------------------------*/
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,P->cmap->bs,P->cmap->bs);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  merge->bi        = pti;      /* Cseq->i */
  merge->bj        = ptj;      /* Cseq->j */
  merge->coi       = coi;      /* Co->i   */
  merge->coj       = coj;      /* Co->j   */
  merge->buf_ri    = buf_ri;
  merge->buf_rj    = buf_rj;
  merge->owners_co = owners_co;
  merge->destroy   = Cmpi->ops->destroy;
  merge->duplicate = Cmpi->ops->duplicate;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled      = PETSC_FALSE;
  Cmpi->ops->destroy   = MatDestroy_MPIAIJ_PtAP;
  Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;

  /* attach the supporting struct to Cmpi for reuse */
  c           = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap     = ptap;
  ptap->api   = api;
  ptap->apj   = apj;
  ptap->rmax  = ap_rmax;
  *C          = Cmpi;

  /* flag 'scalable' determines which implementation is used:
       0: do dense axpy in MatPtAPNumeric() - fast, but requires storage of a nonscalable dense array apa;
       1: do sparse axpy in MatPtAPNumeric() - might be slower, but uses a sparse array apa */
  /* set default scalable */
  ptap->scalable = PETSC_TRUE;

  ierr = PetscOptionsGetBool(((PetscObject)Cmpi)->prefix,"-matptap_scalable",&ptap->scalable,NULL);CHKERRQ(ierr);
  if (!ptap->scalable) {  /* Do dense axpy */
    ierr = PetscCalloc1(pN,&ptap->apa);CHKERRQ(ierr);
  } else {
    ierr = PetscCalloc1(ap_rmax+1,&ptap->apa);CHKERRQ(ierr);
  }

#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t4);CHKERRQ(ierr);
  if (rank==1) {ierr = PetscPrintf(MPI_COMM_SELF,"  [%d] PtAPSymbolic %g/P + %g/AP + %g/comm + %g/PtAP = %g\n",rank,t1-t0,t2-t1,t3-t2,t4-t3,t4-t0);CHKERRQ(ierr);}
#endif

#if defined(PETSC_USE_INFO)
  if (pti[pn] != 0) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatPtAP(A,P,MatReuse,%G,&C) for best performance.\n",afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}
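In normal use this symbolic phase is not called directly; it is reached through MatPtAP(), which runs the symbolic routine on the first call and only the numeric phase on reuse. A hedged usage sketch, assuming A and P are already assembled MPIAIJ matrices and 2.0 is just a guessed fill ratio:

  Mat C;
  ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr); /* symbolic + numeric phase */
  /* ... change the numerical values of A, keeping its nonzero pattern ... */
  ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);CHKERRQ(ierr);   /* numeric phase only */
  ierr = MatDestroy(&C);CHKERRQ(ierr);

The -matptap_scalable option read near the end of the routine switches the numeric phase between the dense and sparse axpy variants described in the comment above.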
Example #3
static PetscErrorCode DMCreateMatrix_Composite_AIJ(DM dm,MatType mtype,Mat *J)
{
  PetscErrorCode         ierr;
  DM_Composite           *com = (DM_Composite*)dm->data;
  struct DMCompositeLink *next;
  PetscInt               m,*dnz,*onz,i,j,mA;
  Mat                    Atmp;
  PetscMPIInt            rank;
  PetscBool              dense = PETSC_FALSE;

  PetscFunctionBegin;
  /* use global vector to determine layout needed for matrix */
  m = com->n;

  ierr = MatCreate(PetscObjectComm((PetscObject)dm),J);CHKERRQ(ierr);
  ierr = MatSetSizes(*J,m,m,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetType(*J,mtype);CHKERRQ(ierr);

  /*
     Extremely inefficient but will compute entire Jacobian for testing
  */
  ierr = PetscOptionsGetBool(((PetscObject)dm)->prefix,"-dmcomposite_dense_jacobian",&dense,NULL);CHKERRQ(ierr);
  if (dense) {
    PetscInt    rstart,rend,*indices;
    PetscScalar *values;

    mA   = com->N;
    ierr = MatMPIAIJSetPreallocation(*J,mA,NULL,mA-m,NULL);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*J,mA,NULL);CHKERRQ(ierr);

    ierr = MatGetOwnershipRange(*J,&rstart,&rend);CHKERRQ(ierr);
    ierr = PetscMalloc2(mA,PetscScalar,&values,mA,PetscInt,&indices);CHKERRQ(ierr);
    ierr = PetscMemzero(values,mA*sizeof(PetscScalar));CHKERRQ(ierr);
    for (i=0; i<mA; i++) indices[i] = i;
    for (i=rstart; i<rend; i++) {
      ierr = MatSetValues(*J,1,&i,mA,indices,values,INSERT_VALUES);CHKERRQ(ierr);
    }
    ierr = PetscFree2(values,indices);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(*J,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(*J,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)dm),&rank);CHKERRQ(ierr);
  ierr = MatPreallocateInitialize(PetscObjectComm((PetscObject)dm),m,m,dnz,onz);CHKERRQ(ierr);
  /* loop over packed objects, handling one at a time */
  next = com->next;
  while (next) {
    PetscInt       nc,rstart,*ccols,maxnc;
    const PetscInt *cols,*rstarts;
    PetscMPIInt    proc;

    ierr = DMCreateMatrix(next->dm,mtype,&Atmp);CHKERRQ(ierr);
    ierr = MatGetOwnershipRange(Atmp,&rstart,NULL);CHKERRQ(ierr);
    ierr = MatGetOwnershipRanges(Atmp,&rstarts);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Atmp,&mA,NULL);CHKERRQ(ierr);

    maxnc = 0;
    for (i=0; i<mA; i++) {
      ierr  = MatGetRow(Atmp,rstart+i,&nc,NULL,NULL);CHKERRQ(ierr);
      ierr  = MatRestoreRow(Atmp,rstart+i,&nc,NULL,NULL);CHKERRQ(ierr);
      maxnc = PetscMax(nc,maxnc);
    }
    ierr = PetscMalloc(maxnc*sizeof(PetscInt),&ccols);CHKERRQ(ierr);
    for (i=0; i<mA; i++) {
      ierr = MatGetRow(Atmp,rstart+i,&nc,&cols,NULL);CHKERRQ(ierr);
      /* remap the columns taking into account how much they are shifted on each process */
      for (j=0; j<nc; j++) {
        proc = 0;
        while (cols[j] >= rstarts[proc+1]) proc++;
        ccols[j] = cols[j] + next->grstarts[proc] - rstarts[proc];
      }
      ierr = MatPreallocateSet(com->rstart+next->rstart+i,nc,ccols,dnz,onz);CHKERRQ(ierr);
      ierr = MatRestoreRow(Atmp,rstart+i,&nc,&cols,NULL);CHKERRQ(ierr);
    }
    ierr = PetscFree(ccols);CHKERRQ(ierr);
    ierr = MatDestroy(&Atmp);CHKERRQ(ierr);
    next = next->next;
  }
  if (com->FormCoupleLocations) {
    ierr = (*com->FormCoupleLocations)(dm,NULL,dnz,onz,__rstart,__nrows,__start,__end);CHKERRQ(ierr);
  }
  ierr = MatMPIAIJSetPreallocation(*J,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(*J,0,dnz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  if (dm->prealloc_only) PetscFunctionReturn(0);

  next = com->next;
  while (next) {
    PetscInt          nc,rstart,row,maxnc,*ccols;
    const PetscInt    *cols,*rstarts;
    const PetscScalar *values;
    PetscMPIInt       proc;

    ierr  = DMCreateMatrix(next->dm,mtype,&Atmp);CHKERRQ(ierr);
    ierr  = MatGetOwnershipRange(Atmp,&rstart,NULL);CHKERRQ(ierr);
    ierr  = MatGetOwnershipRanges(Atmp,&rstarts);CHKERRQ(ierr);
    ierr  = MatGetLocalSize(Atmp,&mA,NULL);CHKERRQ(ierr);
    maxnc = 0;
    for (i=0; i<mA; i++) {
      ierr  = MatGetRow(Atmp,rstart+i,&nc,NULL,NULL);CHKERRQ(ierr);
      ierr  = MatRestoreRow(Atmp,rstart+i,&nc,NULL,NULL);CHKERRQ(ierr);
      maxnc = PetscMax(nc,maxnc);
    }
    ierr = PetscMalloc(maxnc*sizeof(PetscInt),&ccols);CHKERRQ(ierr);
    for (i=0; i<mA; i++) {
      ierr = MatGetRow(Atmp,rstart+i,&nc,(const PetscInt**)&cols,&values);CHKERRQ(ierr);
      for (j=0; j<nc; j++) {
        proc = 0;
        while (cols[j] >= rstarts[proc+1]) proc++;
        ccols[j] = cols[j] + next->grstarts[proc] - rstarts[proc];
      }
      row  = com->rstart+next->rstart+i;
      ierr = MatSetValues(*J,1,&row,nc,ccols,values,INSERT_VALUES);CHKERRQ(ierr);
      ierr = MatRestoreRow(Atmp,rstart+i,&nc,(const PetscInt**)&cols,&values);CHKERRQ(ierr);
    }
    ierr = PetscFree(ccols);CHKERRQ(ierr);
    ierr = MatDestroy(&Atmp);CHKERRQ(ierr);
    next = next->next;
  }
  if (com->FormCoupleLocations) {
    PetscInt __rstart;
    ierr = MatGetOwnershipRange(*J,&__rstart,NULL);CHKERRQ(ierr);
    ierr = (*com->FormCoupleLocations)(dm,*J,NULL,NULL,__rstart,0,0,0);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(*J,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(*J,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
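For context, a hedged sketch of how this routine is typically reached: user code packs sub-DMs into a DMCOMPOSITE and asks it for a matrix, and the composite DM dispatches to DMCreateMatrix_Composite_AIJ for AIJ types. Here dm1 and dm2 stand for sub-DMs assumed to have been created elsewhere (e.g. with DMDACreate1d()):

  DM  packer,dm1,dm2; /* dm1, dm2: sub-DMs assumed created elsewhere */
  Mat J;

  ierr = DMCompositeCreate(PETSC_COMM_WORLD,&packer);CHKERRQ(ierr);
  ierr = DMCompositeAddDM(packer,dm1);CHKERRQ(ierr);
  ierr = DMCompositeAddDM(packer,dm2);CHKERRQ(ierr);
  ierr = DMCreateMatrix(packer,MATAIJ,&J);CHKERRQ(ierr); /* ends up in the routine above */

Running with -dmcomposite_dense_jacobian takes the dense branch at the top of the routine instead of the row-by-row preallocation loops.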
Example #4
PetscErrorCode MatISGetMPIXAIJ_IS(Mat mat, MatReuse reuse, Mat *M)
{
  Mat_IS                 *matis = (Mat_IS*)(mat->data);
  /* info on mat */
  /* ISLocalToGlobalMapping rmapping,cmapping; */
  PetscInt               bs,rows,cols;
  PetscInt               lrows,lcols;
  PetscInt               local_rows,local_cols;
  PetscBool              isdense,issbaij,issbaij_red;
  /* values insertion */
  PetscScalar            *array;
  PetscInt               *local_indices,*global_indices;
  /* work */
  PetscInt               i,j,index_row;
  PetscErrorCode         ierr;

  PetscFunctionBegin;
  /* MISSING CHECKS
    - rectangular case not covered (it is not allowed by MATIS)
  */
  /* get info from mat */
  /* ierr = MatGetLocalToGlobalMapping(mat,&rmapping,&cmapping);CHKERRQ(ierr); */
  ierr = MatGetSize(mat,&rows,&cols);CHKERRQ(ierr);
  ierr = MatGetBlockSize(mat,&bs);CHKERRQ(ierr);
  ierr = MatGetSize(matis->A,&local_rows,&local_cols);CHKERRQ(ierr);
  ierr = PetscObjectTypeCompare((PetscObject)matis->A,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  ierr = PetscObjectTypeCompare((PetscObject)matis->A,MATSEQSBAIJ,&issbaij);CHKERRQ(ierr);

  /* work */
  ierr = PetscMalloc1(local_rows,&local_indices);CHKERRQ(ierr);
  for (i=0;i<local_rows;i++) local_indices[i]=i;
  /* map indices of local mat to global values */
  ierr = PetscMalloc(PetscMax(local_cols,local_rows)*sizeof(*global_indices),&global_indices);CHKERRQ(ierr);
  /* ierr = ISLocalToGlobalMappingApply(rmapping,local_rows,local_indices,global_indices);CHKERRQ(ierr); */
  ierr = ISLocalToGlobalMappingApply(matis->mapping,local_rows,local_indices,global_indices);CHKERRQ(ierr);

  if (issbaij) {
    ierr = MatGetRowUpperTriangular(matis->A);CHKERRQ(ierr);
  }

  if (reuse == MAT_INITIAL_MATRIX) {
    Mat         new_mat;
    MatType     new_mat_type;
    Vec         vec_dnz,vec_onz;
    PetscScalar *my_dnz,*my_onz;
    PetscInt    *dnz,*onz,*mat_ranges,*row_ownership;
    PetscInt    index_col,owner;
    PetscMPIInt nsubdomains;

    /* determining new matrix type */
    ierr = MPI_Allreduce(&issbaij,&issbaij_red,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)mat));CHKERRQ(ierr);
    if (issbaij_red) {
      new_mat_type = MATSBAIJ;
    } else {
      if (bs>1) {
        new_mat_type = MATBAIJ;
      } else {
        new_mat_type = MATAIJ;
      }
    }

    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)mat),&nsubdomains);CHKERRQ(ierr);
    ierr = MatCreate(PetscObjectComm((PetscObject)mat),&new_mat);CHKERRQ(ierr);
    ierr = MatSetSizes(new_mat,PETSC_DECIDE,PETSC_DECIDE,rows,cols);CHKERRQ(ierr);
    ierr = MatSetBlockSize(new_mat,bs);CHKERRQ(ierr);
    ierr = MatSetType(new_mat,new_mat_type);CHKERRQ(ierr);
    ierr = MatSetUp(new_mat);CHKERRQ(ierr);
    ierr = MatGetLocalSize(new_mat,&lrows,&lcols);CHKERRQ(ierr);

    /*
      preallocation
    */

    ierr = MatPreallocateInitialize(PetscObjectComm((PetscObject)new_mat),lrows,lcols,dnz,onz);CHKERRQ(ierr);
    /*
       Some vectors are needed to sum up properly on shared interface dofs.
       Preallocation macros cannot do the job.
       Note that preallocation is not exact, since it overestimates nonzeros
    */
    ierr = MatCreateVecs(new_mat,NULL,&vec_dnz);CHKERRQ(ierr);
    /* ierr = VecSetLocalToGlobalMapping(vec_dnz,rmapping);CHKERRQ(ierr); */
    ierr = VecSetLocalToGlobalMapping(vec_dnz,matis->mapping);CHKERRQ(ierr);
    ierr = VecDuplicate(vec_dnz,&vec_onz);CHKERRQ(ierr);
    /* All processes need to compute entire row ownership */
    ierr = PetscMalloc1(rows,&row_ownership);CHKERRQ(ierr);
    ierr = MatGetOwnershipRanges(new_mat,(const PetscInt**)&mat_ranges);CHKERRQ(ierr);
    for (i=0;i<nsubdomains;i++) {
      for (j=mat_ranges[i];j<mat_ranges[i+1];j++) {
        row_ownership[j]=i;
      }
    }

    /*
       my_dnz and my_onz contain the exact contribution to the preallocation from each local mat;
       they are then summed up properly. This way, the preallocation is always sufficient
    */
    ierr = PetscMalloc1(local_rows,&my_dnz);CHKERRQ(ierr);
    ierr = PetscMalloc1(local_rows,&my_onz);CHKERRQ(ierr);
    ierr = PetscMemzero(my_dnz,local_rows*sizeof(*my_dnz));CHKERRQ(ierr);
    ierr = PetscMemzero(my_onz,local_rows*sizeof(*my_onz));CHKERRQ(ierr);
    /* preallocation as a MATAIJ */
    if (isdense) { /* special case for dense local matrices */
      for (i=0;i<local_rows;i++) {
        index_row = global_indices[i];
        for (j=i;j<local_rows;j++) {
          owner = row_ownership[index_row];
          index_col = global_indices[j];
          if (index_col > mat_ranges[owner]-1 && index_col < mat_ranges[owner+1] ) { /* diag block */
            my_dnz[i] += 1.0;
          } else { /* offdiag block */
            my_onz[i] += 1.0;
          }
          /* same as before, interchanging rows and cols */
          if (i != j) {
            owner = row_ownership[index_col];
            if (index_row > mat_ranges[owner]-1 && index_row < mat_ranges[owner+1] ) {
              my_dnz[j] += 1.0;
            } else {
              my_onz[j] += 1.0;
            }
          }
        }
      }
    } else {
      for (i=0;i<local_rows;i++) {
        PetscInt ncols;
        const PetscInt *cols;
        index_row = global_indices[i];
        ierr = MatGetRow(matis->A,i,&ncols,&cols,NULL);CHKERRQ(ierr);
        for (j=0;j<ncols;j++) {
          owner = row_ownership[index_row];
          index_col = global_indices[cols[j]];
          if (index_col > mat_ranges[owner]-1 && index_col < mat_ranges[owner+1] ) { /* diag block */
            my_dnz[i] += 1.0;
          } else { /* offdiag block */
            my_onz[i] += 1.0;
          }
          /* same as before, interchanging rows and cols */
          if (issbaij && index_col != index_row) {
            owner = row_ownership[index_col];
            if (index_row > mat_ranges[owner]-1 && index_row < mat_ranges[owner+1] ) {
              my_dnz[cols[j]] += 1.0;
            } else {
              my_onz[cols[j]] += 1.0;
            }
          }
        }
        ierr = MatRestoreRow(matis->A,i,&ncols,&cols,NULL);CHKERRQ(ierr);
      }
    }
    ierr = VecSet(vec_dnz,0.0);CHKERRQ(ierr);
    ierr = VecSet(vec_onz,0.0);CHKERRQ(ierr);
    if (local_rows) { /* multilevel guard */
      ierr = VecSetValuesLocal(vec_dnz,local_rows,local_indices,my_dnz,ADD_VALUES);CHKERRQ(ierr);
      ierr = VecSetValuesLocal(vec_onz,local_rows,local_indices,my_onz,ADD_VALUES);CHKERRQ(ierr);
    }
    ierr = VecAssemblyBegin(vec_dnz);CHKERRQ(ierr);
    ierr = VecAssemblyBegin(vec_onz);CHKERRQ(ierr);
    ierr = VecAssemblyEnd(vec_dnz);CHKERRQ(ierr);
    ierr = VecAssemblyEnd(vec_onz);CHKERRQ(ierr);
    ierr = PetscFree(my_dnz);CHKERRQ(ierr);
    ierr = PetscFree(my_onz);CHKERRQ(ierr);
    ierr = PetscFree(row_ownership);CHKERRQ(ierr);

    /* set computed preallocation in dnz and onz */
    ierr = VecGetArray(vec_dnz,&array);CHKERRQ(ierr);
    for (i=0; i<lrows; i++) dnz[i] = (PetscInt)PetscRealPart(array[i]);
    ierr = VecRestoreArray(vec_dnz,&array);CHKERRQ(ierr);
    ierr = VecGetArray(vec_onz,&array);CHKERRQ(ierr);
    for (i=0;i<lrows;i++) onz[i] = (PetscInt)PetscRealPart(array[i]);
    ierr = VecRestoreArray(vec_onz,&array);CHKERRQ(ierr);
    ierr = VecDestroy(&vec_dnz);CHKERRQ(ierr);
    ierr = VecDestroy(&vec_onz);CHKERRQ(ierr);

    /* Resize preallocation if overestimated */
    for (i=0;i<lrows;i++) {
      dnz[i] = PetscMin(dnz[i],lcols);
      onz[i] = PetscMin(onz[i],cols-lcols);
    }
    /* set preallocation */
    ierr = MatMPIAIJSetPreallocation(new_mat,0,dnz,0,onz);CHKERRQ(ierr);
    for (i=0;i<lrows/bs;i++) {
      dnz[i] = dnz[i*bs]/bs;
      onz[i] = onz[i*bs]/bs;
    }
    ierr = MatMPIBAIJSetPreallocation(new_mat,bs,0,dnz,0,onz);CHKERRQ(ierr);
    for (i=0;i<lrows/bs;i++) {
      dnz[i] = dnz[i]-i;
    }
    ierr = MatMPISBAIJSetPreallocation(new_mat,bs,0,dnz,0,onz);CHKERRQ(ierr);
    ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
    *M = new_mat;
  } else {
    PetscInt mbs,mrows,mcols;
    /* some checks */
    ierr = MatGetBlockSize(*M,&mbs);CHKERRQ(ierr);
    ierr = MatGetSize(*M,&mrows,&mcols);CHKERRQ(ierr);
    if (mrows != rows) {
      SETERRQ2(PetscObjectComm((PetscObject)mat),PETSC_ERR_SUP,"Cannot reuse matrix. Wrong number of rows (%d != %d)",rows,mrows);
    }
    if (mcols != cols) {
      SETERRQ2(PetscObjectComm((PetscObject)mat),PETSC_ERR_SUP,"Cannot reuse matrix. Wrong number of cols (%d != %d)",cols,mcols);
    }
    if (mbs != bs) {
      SETERRQ2(PetscObjectComm((PetscObject)mat),PETSC_ERR_SUP,"Cannot reuse matrix. Wrong block size (%d != %d)",bs,mbs);
    }
    ierr = MatZeroEntries(*M);CHKERRQ(ierr);
  }
  /* set local to global mappings */
  /* ierr = MatSetLocalToGlobalMapping(*M,rmapping,cmapping);CHKERRQ(ierr); */
  /* Set values */
  if (isdense) { /* special case for dense local matrices */
    ierr = MatSetOption(*M,MAT_ROW_ORIENTED,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatDenseGetArray(matis->A,&array);CHKERRQ(ierr);
    ierr = MatSetValues(*M,local_rows,global_indices,local_cols,global_indices,array,ADD_VALUES);CHKERRQ(ierr);
    ierr = MatDenseRestoreArray(matis->A,&array);CHKERRQ(ierr);
    ierr = PetscFree(local_indices);CHKERRQ(ierr);
    ierr = PetscFree(global_indices);CHKERRQ(ierr);
  } else { /* very basic values insertion for all other matrix types */
    ierr = PetscFree(local_indices);CHKERRQ(ierr);
    for (i=0;i<local_rows;i++) {
      ierr = MatGetRow(matis->A,i,&j,(const PetscInt**)&local_indices,(const PetscScalar**)&array);CHKERRQ(ierr);
      /* ierr = MatSetValuesLocal(*M,1,&i,j,local_indices,array,ADD_VALUES);CHKERRQ(ierr); */
      ierr = ISLocalToGlobalMappingApply(matis->mapping,j,local_indices,global_indices);CHKERRQ(ierr);
      ierr = ISLocalToGlobalMappingApply(matis->mapping,1,&i,&index_row);CHKERRQ(ierr);
      ierr = MatSetValues(*M,1,&index_row,j,global_indices,array,ADD_VALUES);CHKERRQ(ierr);
      ierr = MatRestoreRow(matis->A,i,&j,(const PetscInt**)&local_indices,(const PetscScalar**)&array);CHKERRQ(ierr);
    }
    ierr = PetscFree(global_indices);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(*M,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(*M,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  if (isdense) {
    ierr = MatSetOption(*M,MAT_ROW_ORIENTED,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (issbaij) {
    ierr = MatRestoreRowUpperTriangular(matis->A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
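The implementation above is registered on the MATIS object and, in the same PETSc generation as this code, is normally invoked through the public wrapper MatISGetMPIXAIJ() (later superseded by MatConvert()). A hedged usage sketch, assuming A is an assembled MATIS matrix:

  Mat Aglobal;
  ierr = MatISGetMPIXAIJ(A,MAT_INITIAL_MATRIX,&Aglobal);CHKERRQ(ierr); /* builds the assembled MPI matrix */
  /* ... update the values of A's local matrix, keeping the same sparsity ... */
  ierr = MatISGetMPIXAIJ(A,MAT_REUSE_MATRIX,&Aglobal);CHKERRQ(ierr);   /* re-inserts values, reuses preallocation */
  ierr = MatDestroy(&Aglobal);CHKERRQ(ierr);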