Exemple #1
0
/*@C
   PCGAMGFilterGraph - filter (remove zero and possibly small values from the) graph and make it symmetric if requested

   Collective on Mat

   Input Parameter:
+   a_Gmat - the graph
.   vfilter - threshold paramter [0,1)
-   symm - make the result symmetric

   Level: developer

   Notes:
    This is called before graph coarsers are called.

.seealso: PCGAMGSetThreshold()
@*/
PetscErrorCode PCGAMGFilterGraph(Mat *a_Gmat,PetscReal vfilter,PetscBool symm)
{
  PetscErrorCode    ierr;
  PetscInt          Istart,Iend,Ii,jj,ncols,nnz0,nnz1, NN, MM, nloc;
  PetscMPIInt       rank;
  Mat               Gmat  = *a_Gmat, tGmat, matTrans;
  MPI_Comm          comm;
  const PetscScalar *vals;
  const PetscInt    *idx;
  PetscInt          *d_nnz, *o_nnz;
  Vec               diag;
  MatType           mtype;

  PetscFunctionBegin;
#if defined PETSC_GAMG_USE_LOG
  ierr = PetscLogEventBegin(petsc_gamg_setup_events[GRAPH],0,0,0,0);CHKERRQ(ierr);
#endif
  /* scale Gmat for all values between -1 and 1 */
  ierr = MatCreateVecs(Gmat, &diag, 0);CHKERRQ(ierr);
  ierr = MatGetDiagonal(Gmat, diag);CHKERRQ(ierr);
  ierr = VecReciprocal(diag);CHKERRQ(ierr);
  ierr = VecSqrtAbs(diag);CHKERRQ(ierr);
  ierr = MatDiagonalScale(Gmat, diag, diag);CHKERRQ(ierr);
  ierr = VecDestroy(&diag);CHKERRQ(ierr);

  if (vfilter < 0.0 && !symm) {
    /* Just use the provided matrix as the graph but make all values positive */
    MatInfo     info;
    PetscScalar *avals;
    PetscBool isaij,ismpiaij;
    ierr = PetscObjectBaseTypeCompare((PetscObject)Gmat,MATSEQAIJ,&isaij);CHKERRQ(ierr);
    ierr = PetscObjectBaseTypeCompare((PetscObject)Gmat,MATMPIAIJ,&ismpiaij);CHKERRQ(ierr);
    if (!isaij && !ismpiaij) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_USER,"Require (MPI)AIJ matrix type");
    if (isaij) {
      ierr = MatGetInfo(Gmat,MAT_LOCAL,&info);CHKERRQ(ierr);
      ierr = MatSeqAIJGetArray(Gmat,&avals);CHKERRQ(ierr);
      for (jj = 0; jj<info.nz_used; jj++) avals[jj] = PetscAbsScalar(avals[jj]);
      ierr = MatSeqAIJRestoreArray(Gmat,&avals);CHKERRQ(ierr);
    } else {
      Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)Gmat->data;
      ierr = MatGetInfo(aij->A,MAT_LOCAL,&info);CHKERRQ(ierr);
      ierr = MatSeqAIJGetArray(aij->A,&avals);CHKERRQ(ierr);
      for (jj = 0; jj<info.nz_used; jj++) avals[jj] = PetscAbsScalar(avals[jj]);
      ierr = MatSeqAIJRestoreArray(aij->A,&avals);CHKERRQ(ierr);
      ierr = MatGetInfo(aij->B,MAT_LOCAL,&info);CHKERRQ(ierr);
      ierr = MatSeqAIJGetArray(aij->B,&avals);CHKERRQ(ierr);
      for (jj = 0; jj<info.nz_used; jj++) avals[jj] = PetscAbsScalar(avals[jj]);
      ierr = MatSeqAIJRestoreArray(aij->B,&avals);CHKERRQ(ierr);
    }
#if defined PETSC_GAMG_USE_LOG
    ierr = PetscLogEventEnd(petsc_gamg_setup_events[GRAPH],0,0,0,0);CHKERRQ(ierr);
#endif
    PetscFunctionReturn(0);
  }

  ierr = PetscObjectGetComm((PetscObject)Gmat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatGetOwnershipRange(Gmat, &Istart, &Iend);CHKERRQ(ierr);
  nloc = Iend - Istart;
  ierr = MatGetSize(Gmat, &MM, &NN);CHKERRQ(ierr);

  if (symm) {
    ierr = MatTranspose(Gmat, MAT_INITIAL_MATRIX, &matTrans);CHKERRQ(ierr);
  }

  /* Determine upper bound on nonzeros needed in new filtered matrix */
  ierr = PetscMalloc2(nloc, &d_nnz,nloc, &o_nnz);CHKERRQ(ierr);
  for (Ii = Istart, jj = 0; Ii < Iend; Ii++, jj++) {
    ierr      = MatGetRow(Gmat,Ii,&ncols,NULL,NULL);CHKERRQ(ierr);
    d_nnz[jj] = ncols;
    o_nnz[jj] = ncols;
    ierr      = MatRestoreRow(Gmat,Ii,&ncols,NULL,NULL);CHKERRQ(ierr);
    if (symm) {
      ierr       = MatGetRow(matTrans,Ii,&ncols,NULL,NULL);CHKERRQ(ierr);
      d_nnz[jj] += ncols;
      o_nnz[jj] += ncols;
      ierr       = MatRestoreRow(matTrans,Ii,&ncols,NULL,NULL);CHKERRQ(ierr);
    }
    if (d_nnz[jj] > nloc) d_nnz[jj] = nloc;
    if (o_nnz[jj] > (MM-nloc)) o_nnz[jj] = MM - nloc;
  }
  ierr = MatGetType(Gmat,&mtype);CHKERRQ(ierr);
  ierr = MatCreate(comm, &tGmat);CHKERRQ(ierr);
  ierr = MatSetSizes(tGmat,nloc,nloc,MM,MM);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(tGmat, 1, 1);CHKERRQ(ierr);
  ierr = MatSetType(tGmat, mtype);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(tGmat,0,d_nnz);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(tGmat,0,d_nnz,0,o_nnz);CHKERRQ(ierr);
  ierr = PetscFree2(d_nnz,o_nnz);CHKERRQ(ierr);
  if (symm) {
    ierr = MatDestroy(&matTrans);CHKERRQ(ierr);
  } else {
    /* all entries are generated locally so MatAssembly will be slightly faster for large process counts */
    ierr = MatSetOption(tGmat,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE);CHKERRQ(ierr);
  }

  for (Ii = Istart, nnz0 = nnz1 = 0; Ii < Iend; Ii++) {
    ierr = MatGetRow(Gmat,Ii,&ncols,&idx,&vals);CHKERRQ(ierr);
    for (jj=0; jj<ncols; jj++,nnz0++) {
      PetscScalar sv = PetscAbs(PetscRealPart(vals[jj]));
      if (PetscRealPart(sv) > vfilter) {
        nnz1++;
        if (symm) {
          sv  *= 0.5;
          ierr = MatSetValues(tGmat,1,&Ii,1,&idx[jj],&sv,ADD_VALUES);CHKERRQ(ierr);
          ierr = MatSetValues(tGmat,1,&idx[jj],1,&Ii,&sv,ADD_VALUES);CHKERRQ(ierr);
        } else {
          ierr = MatSetValues(tGmat,1,&Ii,1,&idx[jj],&sv,ADD_VALUES);CHKERRQ(ierr);
        }
      }
    }
    ierr = MatRestoreRow(Gmat,Ii,&ncols,&idx,&vals);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(tGmat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(tGmat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

#if defined PETSC_GAMG_USE_LOG
  ierr = PetscLogEventEnd(petsc_gamg_setup_events[GRAPH],0,0,0,0);CHKERRQ(ierr);
#endif

#if defined(PETSC_USE_INFO)
  {
    double t1 = (!nnz0) ? 1. : 100.*(double)nnz1/(double)nnz0, t2 = (!nloc) ? 1. : (double)nnz0/(double)nloc;
    ierr = PetscInfo4(*a_Gmat,"\t %g%% nnz after filtering, with threshold %g, %g nnz ave. (N=%D)\n",t1,vfilter,t2,MM);CHKERRQ(ierr);
  }
#endif
  ierr    = MatDestroy(&Gmat);CHKERRQ(ierr);
  *a_Gmat = tGmat;
  PetscFunctionReturn(0);
}
Exemple #2
0
PetscErrorCode PCGAMGFilterGraph(Mat *a_Gmat,const PetscReal vfilter,const PetscBool symm,const PetscInt verbose)
{
  PetscErrorCode    ierr;
  PetscInt          Istart,Iend,Ii,jj,ncols,nnz0,nnz1, NN, MM, nloc;
  PetscMPIInt       rank, size;
  Mat               Gmat  = *a_Gmat, tGmat, matTrans;
  MPI_Comm          comm;
  const PetscScalar *vals;
  const PetscInt    *idx;
  PetscInt          *d_nnz, *o_nnz;
  Vec               diag;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)Gmat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MatGetOwnershipRange(Gmat, &Istart, &Iend);CHKERRQ(ierr);
  nloc = Iend - Istart;
  ierr = MatGetSize(Gmat, &MM, &NN);CHKERRQ(ierr);
#if defined PETSC_GAMG_USE_LOG
  ierr = PetscLogEventBegin(petsc_gamg_setup_events[GRAPH],0,0,0,0);CHKERRQ(ierr);
#endif
  /* scale Gmat so filter works */
  ierr = MatGetVecs(Gmat, &diag, 0);CHKERRQ(ierr);
  ierr = MatGetDiagonal(Gmat, diag);CHKERRQ(ierr);
  ierr = VecReciprocal(diag);CHKERRQ(ierr);
  ierr = VecSqrtAbs(diag);CHKERRQ(ierr);
  ierr = MatDiagonalScale(Gmat, diag, diag);CHKERRQ(ierr);
  ierr = VecDestroy(&diag);CHKERRQ(ierr);

  if (symm) {
    ierr = MatTranspose(Gmat, MAT_INITIAL_MATRIX, &matTrans);CHKERRQ(ierr);
  }

  /* filter - dup zeros out matrix */
  ierr = PetscMalloc1(nloc, &d_nnz);CHKERRQ(ierr);
  ierr = PetscMalloc1(nloc, &o_nnz);CHKERRQ(ierr);
  for (Ii = Istart, jj = 0; Ii < Iend; Ii++, jj++) {
    ierr      = MatGetRow(Gmat,Ii,&ncols,NULL,NULL);CHKERRQ(ierr);
    d_nnz[jj] = ncols;
    o_nnz[jj] = ncols;
    ierr      = MatRestoreRow(Gmat,Ii,&ncols,NULL,NULL);CHKERRQ(ierr);
    if (symm) {
      ierr       = MatGetRow(matTrans,Ii,&ncols,NULL,NULL);CHKERRQ(ierr);
      d_nnz[jj] += ncols;
      o_nnz[jj] += ncols;
      ierr       = MatRestoreRow(matTrans,Ii,&ncols,NULL,NULL);CHKERRQ(ierr);
    }
    if (d_nnz[jj] > nloc) d_nnz[jj] = nloc;
    if (o_nnz[jj] > (MM-nloc)) o_nnz[jj] = MM - nloc;
  }
  ierr = MatCreateAIJ(comm, nloc, nloc, MM, MM, 0, d_nnz, 0, o_nnz, &tGmat);CHKERRQ(ierr);
  ierr = PetscFree(d_nnz);CHKERRQ(ierr);
  ierr = PetscFree(o_nnz);CHKERRQ(ierr);
  if (symm) {
    ierr = MatDestroy(&matTrans);CHKERRQ(ierr);
  }

  for (Ii = Istart, nnz0 = nnz1 = 0; Ii < Iend; Ii++) {
    ierr = MatGetRow(Gmat,Ii,&ncols,&idx,&vals);CHKERRQ(ierr);
    for (jj=0; jj<ncols; jj++,nnz0++) {
      PetscScalar sv = PetscAbs(PetscRealPart(vals[jj]));
      if (PetscRealPart(sv) > vfilter) {
        nnz1++;
        if (symm) {
          sv  *= 0.5;
          ierr = MatSetValues(tGmat,1,&Ii,1,&idx[jj],&sv,ADD_VALUES);CHKERRQ(ierr);
          ierr = MatSetValues(tGmat,1,&idx[jj],1,&Ii,&sv,ADD_VALUES);CHKERRQ(ierr);
        } else {
          ierr = MatSetValues(tGmat,1,&Ii,1,&idx[jj],&sv,ADD_VALUES);CHKERRQ(ierr);
        }
      }
    }
    ierr = MatRestoreRow(Gmat,Ii,&ncols,&idx,&vals);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(tGmat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(tGmat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

#if defined PETSC_GAMG_USE_LOG
  ierr = PetscLogEventEnd(petsc_gamg_setup_events[GRAPH],0,0,0,0);CHKERRQ(ierr);
#endif

  if (verbose) {
    if (verbose == 1) {
      ierr = PetscPrintf(comm,"\t[%d]%s %g%% nnz after filtering, with threshold %g, %g nnz ave. (N=%d)\n",rank,__FUNCT__,
                         100.*(double)nnz1/(double)nnz0,vfilter,(double)nnz0/(double)nloc,MM);CHKERRQ(ierr);
    } else {
      PetscInt nnz[2],out[2];
      nnz[0] = nnz0; nnz[1] = nnz1;
      ierr   = MPI_Allreduce(nnz, out, 2, MPIU_INT, MPI_SUM, comm);CHKERRQ(ierr);
      ierr   = PetscPrintf(comm,"\t[%d]%s %g%% nnz after filtering, with threshold %g, %g nnz ave. (N=%d)\n",rank,__FUNCT__,
                           100.*(double)out[1]/(double)out[0],vfilter,(double)out[0]/(double)MM,MM);CHKERRQ(ierr);
    }
  }
  ierr    = MatDestroy(&Gmat);CHKERRQ(ierr);
  *a_Gmat = tGmat;
  PetscFunctionReturn(0);
}
Exemple #3
0
static PetscErrorCode KSPSolve_LSQR(KSP ksp)
{
  PetscErrorCode ierr;
  PetscInt       i,size1,size2;
  PetscScalar    rho,rhobar,phi,phibar,theta,c,s,tmp,tau;
  PetscReal      beta,alpha,rnorm;
  Vec            X,B,V,V1,U,U1,TMP,W,W2,SE,Z = NULL;
  Mat            Amat,Pmat;
  MatStructure   pflag;
  KSP_LSQR       *lsqr = (KSP_LSQR*)ksp->data;
  PetscBool      diagonalscale,nopreconditioner;

  PetscFunctionBegin;
  ierr = PCGetDiagonalScale(ksp->pc,&diagonalscale);CHKERRQ(ierr);
  if (diagonalscale) SETERRQ1(PetscObjectComm((PetscObject)ksp),PETSC_ERR_SUP,"Krylov method %s does not support diagonal scaling",((PetscObject)ksp)->type_name);

  ierr = PCGetOperators(ksp->pc,&Amat,&Pmat,&pflag);CHKERRQ(ierr);
  ierr = PetscObjectTypeCompare((PetscObject)ksp->pc,PCNONE,&nopreconditioner);CHKERRQ(ierr);

  /*  nopreconditioner =PETSC_FALSE; */
  /* Calculate norm of right hand side */
  ierr = VecNorm(ksp->vec_rhs,NORM_2,&lsqr->rhs_norm);CHKERRQ(ierr);

  /* mark norm of matrix with negative number to indicate it has not yet been computed */
  lsqr->anorm = -1.0;

  /* vectors of length m, where system size is mxn */
  B  = ksp->vec_rhs;
  U  = lsqr->vwork_m[0];
  U1 = lsqr->vwork_m[1];

  /* vectors of length n */
  X  = ksp->vec_sol;
  W  = lsqr->vwork_n[0];
  V  = lsqr->vwork_n[1];
  V1 = lsqr->vwork_n[2];
  W2 = lsqr->vwork_n[3];
  if (!nopreconditioner) Z = lsqr->vwork_n[4];

  /* standard error vector */
  SE = lsqr->se;
  if (SE) {
    ierr = VecGetSize(SE,&size1);CHKERRQ(ierr);
    ierr = VecGetSize(X,&size2);CHKERRQ(ierr);
    if (size1 != size2) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Standard error vector (size %d) does not match solution vector (size %d)",size1,size2);
    ierr = VecSet(SE,0.0);CHKERRQ(ierr);
  }

  /* Compute initial residual, temporarily use work vector u */
  if (!ksp->guess_zero) {
    ierr = KSP_MatMult(ksp,Amat,X,U);CHKERRQ(ierr);       /*   u <- b - Ax     */
    ierr = VecAYPX(U,-1.0,B);CHKERRQ(ierr);
  } else {
    ierr = VecCopy(B,U);CHKERRQ(ierr);            /*   u <- b (x is 0) */
  }

  /* Test for nothing to do */
  ierr       = VecNorm(U,NORM_2,&rnorm);CHKERRQ(ierr);
  ierr       = PetscObjectAMSTakeAccess((PetscObject)ksp);CHKERRQ(ierr);
  ksp->its   = 0;
  ksp->rnorm = rnorm;
  ierr       = PetscObjectAMSGrantAccess((PetscObject)ksp);CHKERRQ(ierr);
  ierr = KSPLogResidualHistory(ksp,rnorm);CHKERRQ(ierr);
  ierr = KSPMonitor(ksp,0,rnorm);CHKERRQ(ierr);
  ierr = (*ksp->converged)(ksp,0,rnorm,&ksp->reason,ksp->cnvP);CHKERRQ(ierr);
  if (ksp->reason) PetscFunctionReturn(0);

  beta = rnorm;
  ierr = VecScale(U,1.0/beta);CHKERRQ(ierr);
  ierr = KSP_MatMultTranspose(ksp,Amat,U,V);CHKERRQ(ierr);
  if (nopreconditioner) {
    ierr = VecNorm(V,NORM_2,&alpha);CHKERRQ(ierr);
  } else {
    ierr = PCApply(ksp->pc,V,Z);CHKERRQ(ierr);
    ierr = VecDotRealPart(V,Z,&alpha);CHKERRQ(ierr);
    if (alpha <= 0.0) {
      ksp->reason = KSP_DIVERGED_BREAKDOWN;
      PetscFunctionReturn(0);
    }
    alpha = PetscSqrtReal(alpha);
    ierr  = VecScale(Z,1.0/alpha);CHKERRQ(ierr);
  }
  ierr = VecScale(V,1.0/alpha);CHKERRQ(ierr);

  if (nopreconditioner) {
    ierr = VecCopy(V,W);CHKERRQ(ierr);
  } else {
    ierr = VecCopy(Z,W);CHKERRQ(ierr);
  }

  lsqr->arnorm = alpha * beta;
  phibar       = beta;
  rhobar       = alpha;
  i            = 0;
  do {
    if (nopreconditioner) {
      ierr = KSP_MatMult(ksp,Amat,V,U1);CHKERRQ(ierr);
    } else {
      ierr = KSP_MatMult(ksp,Amat,Z,U1);CHKERRQ(ierr);
    }
    ierr = VecAXPY(U1,-alpha,U);CHKERRQ(ierr);
    ierr = VecNorm(U1,NORM_2,&beta);CHKERRQ(ierr);
    if (beta == 0.0) {
      ksp->reason = KSP_DIVERGED_BREAKDOWN;
      break;
    }
    ierr = VecScale(U1,1.0/beta);CHKERRQ(ierr);

    ierr = KSP_MatMultTranspose(ksp,Amat,U1,V1);CHKERRQ(ierr);
    ierr = VecAXPY(V1,-beta,V);CHKERRQ(ierr);
    if (nopreconditioner) {
      ierr = VecNorm(V1,NORM_2,&alpha);CHKERRQ(ierr);
    } else {
      ierr = PCApply(ksp->pc,V1,Z);CHKERRQ(ierr);
      ierr = VecDotRealPart(V1,Z,&alpha);CHKERRQ(ierr);
      if (alpha <= 0.0) {
        ksp->reason = KSP_DIVERGED_BREAKDOWN;
        break;
      }
      alpha = PetscSqrtReal(alpha);
      ierr  = VecScale(Z,1.0/alpha);CHKERRQ(ierr);
    }
    ierr   = VecScale(V1,1.0/alpha);CHKERRQ(ierr);
    rho    = PetscSqrtScalar(rhobar*rhobar + beta*beta);
    c      = rhobar / rho;
    s      = beta / rho;
    theta  = s * alpha;
    rhobar = -c * alpha;
    phi    = c * phibar;
    phibar = s * phibar;
    tau    = s * phi;

    ierr = VecAXPY(X,phi/rho,W);CHKERRQ(ierr);  /*    x <- x + (phi/rho) w   */

    if (SE) {
      ierr = VecCopy(W,W2);CHKERRQ(ierr);
      ierr = VecSquare(W2);CHKERRQ(ierr);
      ierr = VecScale(W2,1.0/(rho*rho));CHKERRQ(ierr);
      ierr = VecAXPY(SE, 1.0, W2);CHKERRQ(ierr); /* SE <- SE + (w^2/rho^2) */
    }
    if (nopreconditioner) {
      ierr = VecAYPX(W,-theta/rho,V1);CHKERRQ(ierr);  /* w <- v - (theta/rho) w */
    } else {
      ierr = VecAYPX(W,-theta/rho,Z);CHKERRQ(ierr);   /* w <- z - (theta/rho) w */
    }

    lsqr->arnorm = alpha*PetscAbsScalar(tau);
    rnorm        = PetscRealPart(phibar);

    ierr = PetscObjectAMSTakeAccess((PetscObject)ksp);CHKERRQ(ierr);
    ksp->its++;
    ksp->rnorm = rnorm;
    ierr       = PetscObjectAMSGrantAccess((PetscObject)ksp);CHKERRQ(ierr);
    ierr = KSPLogResidualHistory(ksp,rnorm);CHKERRQ(ierr);
    ierr = KSPMonitor(ksp,i+1,rnorm);CHKERRQ(ierr);
    ierr = (*ksp->converged)(ksp,i+1,rnorm,&ksp->reason,ksp->cnvP);CHKERRQ(ierr);
    if (ksp->reason) break;
    SWAP(U1,U,TMP);
    SWAP(V1,V,TMP);

    i++;
  } while (i<ksp->max_it);
  if (i >= ksp->max_it && !ksp->reason) ksp->reason = KSP_DIVERGED_ITS;

  /* Finish off the standard error estimates */
  if (SE) {
    tmp  = 1.0;
    ierr = MatGetSize(Amat,&size1,&size2);CHKERRQ(ierr);
    if (size1 > size2) tmp = size1 - size2;
    tmp  = rnorm / PetscSqrtScalar(tmp);
    ierr = VecSqrtAbs(SE);CHKERRQ(ierr);
    ierr = VecScale(SE,tmp);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
Exemple #4
0
PetscErrorCode heavyEdgeMatchAgg(IS perm,Mat a_Gmat,PetscInt verbose,PetscCoarsenData **a_locals_llist)
{
  PetscErrorCode   ierr;
  PetscBool        isMPI;
  MPI_Comm         wcomm = ((PetscObject)a_Gmat)->comm;
  PetscInt         sub_it,kk,n,ix,*idx,*ii,iter,Iend,my0;
  PetscMPIInt      rank,size;
  const PetscInt   nloc = a_Gmat->rmap->n,n_iter=6; /* need to figure out how to stop this */
  PetscInt         *lid_cprowID,*lid_gid;
  PetscBool        *lid_matched;
  Mat_SeqAIJ       *matA, *matB=0;
  Mat_MPIAIJ       *mpimat=0;
  PetscScalar      one=1.;
  PetscCoarsenData *agg_llists = PETSC_NULL,*deleted_list = PETSC_NULL;
  Mat              cMat,tMat,P;
  MatScalar        *ap;
  PetscMPIInt      tag1,tag2;

  PetscFunctionBegin;
  ierr = MPI_Comm_rank(wcomm, &rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(wcomm, &size);CHKERRQ(ierr);
  ierr = MatGetOwnershipRange(a_Gmat, &my0, &Iend);CHKERRQ(ierr);
  ierr = PetscCommGetNewTag(wcomm, &tag1);CHKERRQ(ierr);
  ierr = PetscCommGetNewTag(wcomm, &tag2);CHKERRQ(ierr);

  ierr = PetscMalloc(nloc*sizeof(PetscInt), &lid_gid);CHKERRQ(ierr); /* explicit array needed */
  ierr = PetscMalloc(nloc*sizeof(PetscInt), &lid_cprowID);CHKERRQ(ierr);
  ierr = PetscMalloc(nloc*sizeof(PetscBool), &lid_matched);CHKERRQ(ierr);

  ierr = PetscCDCreate(nloc, &agg_llists);CHKERRQ(ierr);
  /* ierr = PetscCDSetChuckSize(agg_llists, nloc+1);CHKERRQ(ierr); */
  *a_locals_llist = agg_llists;
  ierr = PetscCDCreate(size, &deleted_list);CHKERRQ(ierr);
  ierr = PetscCDSetChuckSize(deleted_list, 100);CHKERRQ(ierr);
  /* setup 'lid_gid' for scatters and add self to all lists */
  for (kk=0;kk<nloc;kk++) {
    lid_gid[kk] = kk + my0;
    ierr = PetscCDAppendID(agg_llists, kk, my0+kk);CHKERRQ(ierr);
  }

  /* make a copy of the graph, this gets destroyed in iterates */
  ierr = MatDuplicate(a_Gmat,MAT_COPY_VALUES,&cMat);CHKERRQ(ierr);
  ierr = PetscObjectTypeCompare((PetscObject)a_Gmat, MATMPIAIJ, &isMPI);CHKERRQ(ierr);
  iter = 0;
  while(iter++ < n_iter) {
    PetscScalar    *cpcol_gid,*cpcol_max_ew,*cpcol_max_pe,*lid_max_ew;
    PetscBool      *cpcol_matched;
    PetscMPIInt    *cpcol_pe,proc;
    Vec            locMaxEdge,locMaxPE,ghostMaxEdge,ghostMaxPE;
    PetscInt       nEdges,n_nz_row,jj;
    Edge           *Edges;
    PetscInt       gid;
    const PetscInt *perm_ix, n_sub_its = 120;

    /* get submatrices of cMat */
    if (isMPI) {
      mpimat = (Mat_MPIAIJ*)cMat->data;
      matA = (Mat_SeqAIJ*)mpimat->A->data;
      matB = (Mat_SeqAIJ*)mpimat->B->data;
      /* force compressed storage of B */
      matB->compressedrow.check = PETSC_TRUE;
      ierr = MatCheckCompressedRow(mpimat->B,&matB->compressedrow,matB->i,cMat->rmap->n,-1.0);CHKERRQ(ierr);
      assert(matB->compressedrow.use);
    } else {
      matA = (Mat_SeqAIJ*)cMat->data;
    }
    assert(matA && !matA->compressedrow.use);
    assert(matB==0 || matB->compressedrow.use);

    /* set max edge on nodes */
    ierr = MatGetVecs(cMat, &locMaxEdge, 0);CHKERRQ(ierr);
    ierr = MatGetVecs(cMat, &locMaxPE, 0);CHKERRQ(ierr);

    /* get 'cpcol_pe' & 'cpcol_gid' & init. 'cpcol_matched' using 'mpimat->lvec' */
    if (mpimat) {
      Vec         vec; 
      PetscScalar vval;

      ierr = MatGetVecs(cMat, &vec, 0);CHKERRQ(ierr);
      /* cpcol_pe */
      vval = (PetscScalar)(rank);
      for (kk=0,gid=my0;kk<nloc;kk++,gid++) {
        ierr = VecSetValues(vec, 1, &gid, &vval, INSERT_VALUES);CHKERRQ(ierr); /* set with GID */
      }
      ierr = VecAssemblyBegin(vec);CHKERRQ(ierr);
      ierr = VecAssemblyEnd(vec);CHKERRQ(ierr);
      ierr = VecScatterBegin(mpimat->Mvctx,vec,mpimat->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
      ierr = VecScatterEnd(mpimat->Mvctx,vec,mpimat->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
      ierr = VecGetArray(mpimat->lvec, &cpcol_gid);CHKERRQ(ierr); /* get proc ID in 'cpcol_gid' */
      ierr = VecGetLocalSize(mpimat->lvec, &n);CHKERRQ(ierr);
      ierr = PetscMalloc(n*sizeof(PetscInt), &cpcol_pe);CHKERRQ(ierr);
      for (kk=0;kk<n;kk++) cpcol_pe[kk] = (PetscMPIInt)PetscRealPart(cpcol_gid[kk]);
      ierr = VecRestoreArray(mpimat->lvec, &cpcol_gid);CHKERRQ(ierr);

      /* cpcol_gid */
      for (kk=0,gid=my0;kk<nloc;kk++,gid++) {
        vval = (PetscScalar)(gid);
        ierr = VecSetValues(vec, 1, &gid, &vval, INSERT_VALUES);CHKERRQ(ierr); /* set with GID */
      }
      ierr = VecAssemblyBegin(vec);CHKERRQ(ierr);
      ierr = VecAssemblyEnd(vec);CHKERRQ(ierr);
      ierr = VecScatterBegin(mpimat->Mvctx,vec,mpimat->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
      ierr = VecScatterEnd(mpimat->Mvctx,vec,mpimat->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
      ierr = VecDestroy(&vec);CHKERRQ(ierr);
      ierr = VecGetArray(mpimat->lvec, &cpcol_gid);CHKERRQ(ierr); /* get proc ID in 'cpcol_gid' */

      /* cpcol_matched */
      ierr = VecGetLocalSize(mpimat->lvec, &n);CHKERRQ(ierr);
      ierr = PetscMalloc(n*sizeof(PetscBool), &cpcol_matched);CHKERRQ(ierr);
      for (kk=0;kk<n;kk++) cpcol_matched[kk] = PETSC_FALSE;
    }

    /* need an inverse map - locals */
    for (kk=0;kk<nloc;kk++) lid_cprowID[kk] = -1;
    /* set index into compressed row 'lid_cprowID' */
    if (matB) {
      ii = matB->compressedrow.i;
      for (ix=0; ix<matB->compressedrow.nrows; ix++) {
        lid_cprowID[matB->compressedrow.rindex[ix]] = ix;
      }
    }

    /* get removed IS, use '' */
    /* if (iter==1) { */
    /*   PetscInt *lid_rem,idx; */
    /*   ierr = PetscMalloc(nloc*sizeof(PetscInt), &lid_rem);CHKERRQ(ierr); */
    /*   for (kk=idx=0;kk<nloc;kk++){ */
    /*     PetscInt nn,lid=kk; */
    /*     ii = matA->i; nn = ii[lid+1] - ii[lid]; */
    /*     if ((ix=lid_cprowID[lid]) != -1) { /\* if I have any ghost neighbors *\/ */
    /*       ii = matB->compressedrow.i; */
    /*       nn += ii[ix+1] - ii[ix]; */
    /*     } */
    /*     if (nn < 2) { */
    /*       lid_rem[idx++] = kk + my0; */
    /*     } */
    /*   } */
    /*   ierr = PetscCDSetRemovedIS(agg_llists, wcomm, idx, lid_rem);CHKERRQ(ierr); */
    /*   ierr = PetscFree(lid_rem);CHKERRQ(ierr); */
    /* } */

    /* compute 'locMaxEdge' & 'locMaxPE', and create list of edges, count edges' */
    for (nEdges=0,kk=0,gid=my0;kk<nloc;kk++,gid++){
      PetscReal   max_e = 0., tt;
      PetscScalar vval;
      PetscInt    lid = kk;
      PetscMPIInt max_pe=rank,pe;
      ii = matA->i; n = ii[lid+1] - ii[lid]; idx = matA->j + ii[lid];
      ap = matA->a + ii[lid];
      for (jj=0; jj<n; jj++) {
        PetscInt lidj = idx[jj];
        if (lidj != lid && PetscRealPart(ap[jj]) > max_e) max_e = PetscRealPart(ap[jj]);
        if (lidj > lid) nEdges++;
      }
      if ((ix=lid_cprowID[lid]) != -1) { /* if I have any ghost neighbors */
        ii = matB->compressedrow.i; n = ii[ix+1] - ii[ix];
        ap = matB->a + ii[ix];
        idx = matB->j + ii[ix];
        for (jj=0 ; jj<n ; jj++) {
          if ((tt=PetscRealPart(ap[jj])) > max_e) max_e = tt;
          nEdges++;
          if ((pe=cpcol_pe[idx[jj]]) > max_pe) max_pe = pe;
        }
      }
      vval = max_e;
      ierr = VecSetValues(locMaxEdge, 1, &gid, &vval, INSERT_VALUES);CHKERRQ(ierr);

      vval = (PetscScalar)max_pe;
      ierr = VecSetValues(locMaxPE, 1, &gid, &vval, INSERT_VALUES);CHKERRQ(ierr);
    }
    ierr = VecAssemblyBegin(locMaxEdge);CHKERRQ(ierr);
    ierr = VecAssemblyEnd(locMaxEdge);CHKERRQ(ierr);
    ierr = VecAssemblyBegin(locMaxPE);CHKERRQ(ierr);
    ierr = VecAssemblyEnd(locMaxPE);CHKERRQ(ierr);

    /* get 'cpcol_max_ew' & 'cpcol_max_pe' */
    if (mpimat) {
      ierr = VecDuplicate(mpimat->lvec, &ghostMaxEdge);CHKERRQ(ierr);
      ierr = VecScatterBegin(mpimat->Mvctx,locMaxEdge,ghostMaxEdge,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
      ierr = VecScatterEnd(mpimat->Mvctx,locMaxEdge,ghostMaxEdge,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
      ierr = VecGetArray(ghostMaxEdge, &cpcol_max_ew);CHKERRQ(ierr);

      ierr = VecDuplicate(mpimat->lvec, &ghostMaxPE);CHKERRQ(ierr);
      ierr = VecScatterBegin(mpimat->Mvctx,locMaxPE,ghostMaxPE,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
      ierr =   VecScatterEnd(mpimat->Mvctx,locMaxPE,ghostMaxPE,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
      ierr = VecGetArray(ghostMaxPE, &cpcol_max_pe);CHKERRQ(ierr);
    }

    /* setup sorted list of edges */
    ierr = PetscMalloc(nEdges*sizeof(Edge), &Edges);CHKERRQ(ierr);
    ierr = ISGetIndices(perm, &perm_ix);CHKERRQ(ierr);
    for (nEdges=n_nz_row=kk=0;kk<nloc;kk++){
      PetscInt nn, lid = perm_ix[kk];
      ii = matA->i; nn = n = ii[lid+1] - ii[lid]; idx = matA->j + ii[lid];
      ap = matA->a + ii[lid];
      for (jj=0; jj<n; jj++) {
        PetscInt lidj = idx[jj];        assert(PetscRealPart(ap[jj])>0.);
        if (lidj > lid) {
          Edges[nEdges].lid0 = lid;
          Edges[nEdges].gid1 = lidj + my0;
          Edges[nEdges].cpid1 = -1;
          Edges[nEdges].weight = PetscRealPart(ap[jj]);
          nEdges++;
        }
      }
      if ((ix=lid_cprowID[lid]) != -1) { /* if I have any ghost neighbors */
        ii = matB->compressedrow.i; n = ii[ix+1] - ii[ix];
        ap = matB->a + ii[ix];
        idx = matB->j + ii[ix];
        nn += n;
        for (jj=0 ; jj<n ; jj++) {
          assert(PetscRealPart(ap[jj])>0.);
          Edges[nEdges].lid0 = lid;
          Edges[nEdges].gid1 = (PetscInt)PetscRealPart(cpcol_gid[idx[jj]]);
          Edges[nEdges].cpid1 = idx[jj];
          Edges[nEdges].weight = PetscRealPart(ap[jj]);
          nEdges++;
        }
      }
      if (nn > 1) n_nz_row++;
      else if (iter == 1){
        /* should select this because it is technically in the MIS but lets not */
        ierr = PetscCDRemoveAll(agg_llists, lid);CHKERRQ(ierr);
      }
    }
    ierr = ISRestoreIndices(perm,&perm_ix);CHKERRQ(ierr);

    qsort(Edges, nEdges, sizeof(Edge), gamg_hem_compare);

    /* projection matrix */
    ierr = MatCreateAIJ(wcomm, nloc, nloc, PETSC_DETERMINE, PETSC_DETERMINE, 1, 0, 1, 0, &P);CHKERRQ(ierr);

    /* clear matched flags */
    for (kk=0;kk<nloc;kk++) lid_matched[kk] = PETSC_FALSE;
    /* process - communicate - process */
    for (sub_it=0;sub_it<n_sub_its;sub_it++){
      PetscInt nactive_edges;

      ierr = VecGetArray(locMaxEdge, &lid_max_ew);CHKERRQ(ierr);
      for (kk=nactive_edges=0;kk<nEdges;kk++){
        /* HEM */
        const Edge *e = &Edges[kk];
        const PetscInt lid0=e->lid0,gid1=e->gid1,cpid1=e->cpid1,gid0=lid0+my0,lid1=gid1-my0;
        PetscBool isOK = PETSC_TRUE;

        /* skip if either (local) vertex is done already */
        if (lid_matched[lid0] || (gid1>=my0 && gid1<Iend && lid_matched[gid1-my0])) {
          continue;
        }
        /* skip if ghost vertex is done */
        if (cpid1 != -1 && cpcol_matched[cpid1]) {
          continue;
        }

        nactive_edges++;
        /* skip if I have a bigger edge someplace (lid_max_ew gets updated) */
        if (PetscRealPart(lid_max_ew[lid0]) > e->weight + 1.e-12) {
          continue;
        }

        if (cpid1 == -1) {
          if (PetscRealPart(lid_max_ew[lid1]) > e->weight + 1.e-12) {
            continue;
          }
        } else {
          /* see if edge might get matched on other proc */
          PetscReal g_max_e = PetscRealPart(cpcol_max_ew[cpid1]);
          if (g_max_e > e->weight + 1.e-12) {
            continue;
          } else if (e->weight > g_max_e - 1.e-12 && (PetscMPIInt)PetscRealPart(cpcol_max_pe[cpid1]) > rank) {
            /* check for max_e == to this edge and larger processor that will deal with this */
            continue;
          }
        }

        /* check ghost for v0 */
        if (isOK){
          PetscReal max_e,ew;
          if ((ix=lid_cprowID[lid0]) != -1) { /* if I have any ghost neighbors */
            ii = matB->compressedrow.i; n = ii[ix+1] - ii[ix];
            ap = matB->a + ii[ix];
            idx = matB->j + ii[ix];
            for (jj=0 ; jj<n && isOK; jj++) {
              PetscInt lidj = idx[jj];
              if (cpcol_matched[lidj]) continue;
              ew = PetscRealPart(ap[jj]); max_e = PetscRealPart(cpcol_max_ew[lidj]);
              /* check for max_e == to this edge and larger processor that will deal with this */
              if (ew > max_e - 1.e-12 && ew > PetscRealPart(lid_max_ew[lid0]) - 1.e-12 && (PetscMPIInt)PetscRealPart(cpcol_max_pe[lidj]) > rank){
                isOK = PETSC_FALSE;
              }
            }
          }

          /* for v1 */
          if (cpid1 == -1 && isOK){
            if ((ix=lid_cprowID[lid1]) != -1) { /* if I have any ghost neighbors */
              ii = matB->compressedrow.i; n = ii[ix+1] - ii[ix];
              ap = matB->a + ii[ix];
              idx = matB->j + ii[ix];
              for (jj=0 ; jj<n && isOK ; jj++) {
                PetscInt lidj = idx[jj];
                if (cpcol_matched[lidj]) continue;
                ew = PetscRealPart(ap[jj]); max_e = PetscRealPart(cpcol_max_ew[lidj]);
                /* check for max_e == to this edge and larger processor that will deal with this */
                if (ew > max_e - 1.e-12 && ew > PetscRealPart(lid_max_ew[lid1]) - 1.e-12 && (PetscMPIInt)PetscRealPart(cpcol_max_pe[lidj]) > rank) {
                  isOK = PETSC_FALSE;
                }
              }
            }
          }
        }

        /* do it */
        if (isOK){
          if (cpid1 == -1) {
            lid_matched[lid1] = PETSC_TRUE;  /* keep track of what we've done this round */
            ierr = PetscCDAppendRemove(agg_llists, lid0, lid1);CHKERRQ(ierr);
          } else if (sub_it != n_sub_its-1) {
            /* add gid1 to list of ghost deleted by me -- I need their children */
            proc = cpcol_pe[cpid1];
            cpcol_matched[cpid1] = PETSC_TRUE; /* messing with VecGetArray array -- needed??? */
            ierr = PetscCDAppendID(deleted_list, proc, cpid1);CHKERRQ(ierr); /* cache to send messages */
            ierr = PetscCDAppendID(deleted_list, proc, lid0);CHKERRQ(ierr);
          } else {
            continue;
          }
          lid_matched[lid0] = PETSC_TRUE; /* keep track of what we've done this round */
          /* set projection */
          ierr = MatSetValues(P,1,&gid0,1,&gid0,&one,INSERT_VALUES);CHKERRQ(ierr);
          ierr = MatSetValues(P,1,&gid1,1,&gid0,&one,INSERT_VALUES);CHKERRQ(ierr);
        } /* matched */
      } /* edge loop */

      /* deal with deleted ghost on first pass */
      if (size>1 && sub_it != n_sub_its-1){
        PetscCDPos  pos;  PetscBool ise = PETSC_FALSE;
        PetscInt    nSend1, **sbuffs1,nSend2;
#define REQ_BF_SIZE 100
        MPI_Request *sreqs2[REQ_BF_SIZE],*rreqs2[REQ_BF_SIZE];
        MPI_Status  status;

        /* send request */
        for (proc=0,nSend1=0;proc<size;proc++){
          ierr = PetscCDEmptyAt(deleted_list,proc,&ise);CHKERRQ(ierr);
          if (!ise) nSend1++;
        }
        ierr = PetscMalloc(nSend1*sizeof(PetscInt*), &sbuffs1);CHKERRQ(ierr);
        /* ierr = PetscMalloc4(nSend1, PetscInt*, sbuffs1, nSend1, PetscInt*, rbuffs1, nSend1, MPI_Request*, sreqs1, nSend1, MPI_Request*, rreqs1);CHKERRQ(ierr); */
        /* PetscFree4(sbuffs1,rbuffs1,sreqs1,rreqs1); */
        for (proc=0,nSend1=0;proc<size;proc++){
          /* count ghosts */
          ierr = PetscCDSizeAt(deleted_list,proc,&n);CHKERRQ(ierr);
          if (n>0){
#define CHUNCK_SIZE 100
            PetscInt    *sbuff,*pt;
            MPI_Request *request;
            assert(n%2==0);
            n /= 2;
            ierr = PetscMalloc((2 + 2*n + n*CHUNCK_SIZE)*sizeof(PetscInt) + 2*sizeof(MPI_Request), &sbuff);CHKERRQ(ierr);
            /* PetscMalloc4(2+2*n,PetscInt,sbuffs1[nSend1],n*CHUNCK_SIZE,PetscInt,rbuffs1[nSend1],1,MPI_Request,rreqs2[nSend1],1,MPI_Request,sreqs2[nSend1]); */
            /* save requests */
            sbuffs1[nSend1] = sbuff;
            request = (MPI_Request*)sbuff;
            sbuff = pt = (PetscInt*)(request+1);
            *pt++ = n; *pt++ = rank;

            ierr = PetscCDGetHeadPos(deleted_list,proc,&pos);CHKERRQ(ierr);
            while(pos){
              PetscInt lid0, cpid, gid;
              ierr = PetscLLNGetID(pos, &cpid);CHKERRQ(ierr);
              gid = (PetscInt)PetscRealPart(cpcol_gid[cpid]);
              ierr = PetscCDGetNextPos(deleted_list,proc,&pos);CHKERRQ(ierr);
              ierr = PetscLLNGetID(pos, &lid0);CHKERRQ(ierr);
              ierr = PetscCDGetNextPos(deleted_list,proc,&pos);CHKERRQ(ierr);
              *pt++ = gid; *pt++ = lid0;
            }
            /* send request tag1 [n, proc, n*[gid1,lid0] ] */
            ierr = MPI_Isend(sbuff, 2*n+2, MPIU_INT, proc, tag1, wcomm, request);CHKERRQ(ierr);
            /* post recieve */
            request = (MPI_Request*)pt;
            rreqs2[nSend1] = request; /* cache recv request */
            pt = (PetscInt*)(request+1);
            ierr = MPI_Irecv(pt, n*CHUNCK_SIZE, MPIU_INT, proc, tag2, wcomm, request);CHKERRQ(ierr);
            /* clear list */
            ierr = PetscCDRemoveAll(deleted_list, proc);CHKERRQ(ierr);
            nSend1++;
          }
        }
        /* recieve requests, send response, clear lists */
        kk = nactive_edges;
        ierr = MPI_Allreduce(&kk,&nactive_edges,1,MPIU_INT,MPI_SUM,wcomm);CHKERRQ(ierr); /* not correct syncronization and global */
        nSend2 = 0;
        while(1){
#define BF_SZ 10000
          PetscMPIInt flag,count;
          PetscInt    rbuff[BF_SZ],*pt,*pt2,*pt3,count2,*sbuff,count3;
          MPI_Request *request;
          ierr = MPI_Iprobe(MPI_ANY_SOURCE, tag1, wcomm, &flag, &status);CHKERRQ(ierr);
          if (!flag) break;
          ierr = MPI_Get_count(&status, MPIU_INT, &count);CHKERRQ(ierr);
          if (count > BF_SZ) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"buffer too small for recieve: %d",count);
          proc = status.MPI_SOURCE;
          /* recieve request tag1 [n, proc, n*[gid1,lid0] ] */
          ierr = MPI_Recv(rbuff, count, MPIU_INT, proc, tag1, wcomm, &status);CHKERRQ(ierr);
          /* count sends */
          pt = rbuff; count3 = count2 = 0;
          n = *pt++; kk = *pt++;           assert(kk==proc);
          while(n--){
            PetscInt gid1=*pt++, lid1=gid1-my0; kk=*pt++;  assert(lid1>=0 && lid1<nloc);
            if (lid_matched[lid1]){
              PetscPrintf(PETSC_COMM_SELF,"\t *** [%d]%s %d) ERROR recieved deleted gid %d, deleted by (lid) %d from proc %d\n",rank,__FUNCT__,sub_it,gid1,kk);
              PetscSleep(1);
            }
            assert(!lid_matched[lid1]);
            lid_matched[lid1] = PETSC_TRUE; /* keep track of what we've done this round */
            ierr = PetscCDSizeAt(agg_llists, lid1, &kk);CHKERRQ(ierr);
            count2 += kk + 2;
            count3++; /* number of verts requested (n) */
          }
          assert(pt-rbuff==count);
          if (count2 > count3*CHUNCK_SIZE) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Irecv will be too small: %d",count2);
          /* send tag2 *[lid0, n, n*[gid] ] */
          ierr = PetscMalloc(count2*sizeof(PetscInt) + sizeof(MPI_Request), &sbuff);CHKERRQ(ierr);
          request = (MPI_Request*)sbuff;
          sreqs2[nSend2++] = request; /* cache request */
          if (nSend2==REQ_BF_SIZE) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"buffer too small for requests: %d",nSend2);
          pt2 = sbuff = (PetscInt*)(request+1);
          pt  = rbuff;
          n = *pt++; kk = *pt++;           assert(kk==proc);
          while(n--){
            /* read [n, proc, n*[gid1,lid0] */
            PetscInt gid1=*pt++, lid1=gid1-my0, lid0=*pt++;   assert(lid1>=0 && lid1<nloc);
            /* write [lid0, n, n*[gid] ] */
            *pt2++ = lid0;
            pt3 = pt2++; /* save pointer for later */
            /* for (pos=PetscCDGetHeadPos(agg_llists,lid1) ; pos ; pos=PetscCDGetNextPos(agg_llists,lid1,pos)){ */
            ierr = PetscCDGetHeadPos(agg_llists,lid1,&pos);CHKERRQ(ierr);
            while(pos){
              PetscInt gid;
              ierr = PetscLLNGetID(pos, &gid);CHKERRQ(ierr);
              ierr = PetscCDGetNextPos(agg_llists,lid1,&pos);CHKERRQ(ierr);
              *pt2++ = gid;
            }
            *pt3 = (pt2-pt3)-1;
            /* clear list */
            ierr = PetscCDRemoveAll(agg_llists, lid1);CHKERRQ(ierr);
          }
          assert(pt2-sbuff==count2); assert(pt-rbuff==count);
          /* send requested data tag2 *[lid0, n, n*[gid1] ] */
          ierr = MPI_Isend(sbuff, count2, MPIU_INT, proc, tag2, wcomm, request);CHKERRQ(ierr);
        }

        /* recieve tag2 *[lid0, n, n*[gid] ] */
        for (kk=0;kk<nSend1;kk++){
          PetscMPIInt count;
          MPI_Request *request;
          PetscInt    *pt, *pt2;
          request = rreqs2[kk]; /* no need to free -- buffer is in 'sbuffs1' */
          ierr = MPI_Wait(request, &status);CHKERRQ(ierr);
          ierr = MPI_Get_count(&status, MPIU_INT, &count);CHKERRQ(ierr);
          pt = pt2 = (PetscInt*)(request+1);
          while(pt-pt2 < count){
            PetscInt lid0 = *pt++, n = *pt++;           assert(lid0>=0 && lid0<nloc);
            while(n--){
              PetscInt gid1 = *pt++;
              ierr = PetscCDAppendID(agg_llists, lid0, gid1);CHKERRQ(ierr);
            }
          }
          assert(pt-pt2==count);
        }

        /* wait for tag1 isends */
        while(nSend1--){
          MPI_Request *request;
          request = (MPI_Request*)sbuffs1[nSend1];
          ierr = MPI_Wait(request, &status);CHKERRQ(ierr);
          ierr = PetscFree(request);CHKERRQ(ierr);
        }
        ierr = PetscFree(sbuffs1);CHKERRQ(ierr);

        /* wait for tag2 isends */
        while(nSend2--){
          MPI_Request *request = sreqs2[nSend2];
          ierr = MPI_Wait(request, &status);CHKERRQ(ierr);
          ierr = PetscFree(request);CHKERRQ(ierr);
        }

        ierr = VecRestoreArray(ghostMaxEdge, &cpcol_max_ew);CHKERRQ(ierr);
        ierr = VecRestoreArray(ghostMaxPE, &cpcol_max_pe);CHKERRQ(ierr);

        /* get 'cpcol_matched' - use locMaxPE, ghostMaxEdge, cpcol_max_ew */
        for (kk=0,gid=my0;kk<nloc;kk++,gid++) {
          PetscScalar vval = lid_matched[kk] ? 1.0 : 0.0;
          ierr = VecSetValues(locMaxPE, 1, &gid, &vval, INSERT_VALUES);CHKERRQ(ierr); /* set with GID */
        }
        ierr = VecAssemblyBegin(locMaxPE);CHKERRQ(ierr);
        ierr = VecAssemblyEnd(locMaxPE);CHKERRQ(ierr);
        ierr = VecScatterBegin(mpimat->Mvctx,locMaxPE,ghostMaxEdge,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
        ierr =   VecScatterEnd(mpimat->Mvctx,locMaxPE,ghostMaxEdge,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
        ierr = VecGetArray(ghostMaxEdge, &cpcol_max_ew);CHKERRQ(ierr);
        ierr = VecGetLocalSize(mpimat->lvec, &n);CHKERRQ(ierr);
        for (kk=0;kk<n;kk++) {
          cpcol_matched[kk] = (PetscBool)(PetscRealPart(cpcol_max_ew[kk]) != 0.0);
        }

        ierr = VecRestoreArray(ghostMaxEdge, &cpcol_max_ew);CHKERRQ(ierr);
      } /* size > 1 */

      /* compute 'locMaxEdge' */
      ierr = VecRestoreArray(locMaxEdge, &lid_max_ew);CHKERRQ(ierr);
      for (kk=0,gid=my0;kk<nloc;kk++,gid++){
        PetscReal   max_e = 0.,tt;
        PetscScalar vval;
        PetscInt    lid = kk;
        if (lid_matched[lid]) vval = 0.;
        else {
          ii = matA->i; n = ii[lid+1] - ii[lid]; idx = matA->j + ii[lid];
          ap = matA->a + ii[lid];
          for (jj=0; jj<n; jj++) {
            PetscInt lidj = idx[jj];
            if (lid_matched[lidj]) continue; /* this is new - can change local max */
            if (lidj != lid && PetscRealPart(ap[jj]) > max_e) max_e = PetscRealPart(ap[jj]);
          }
          if (lid_cprowID && (ix=lid_cprowID[lid]) != -1) { /* if I have any ghost neighbors */
            ii = matB->compressedrow.i; n = ii[ix+1] - ii[ix];
            ap = matB->a + ii[ix];
            idx = matB->j + ii[ix];
            for (jj=0 ; jj<n ; jj++) {
              PetscInt lidj = idx[jj];
              if (cpcol_matched[lidj]) continue;
              if ((tt=PetscRealPart(ap[jj])) > max_e) max_e = tt;
            }
          }
        }
        vval = (PetscScalar)max_e;
        ierr = VecSetValues(locMaxEdge, 1, &gid, &vval, INSERT_VALUES);CHKERRQ(ierr); /* set with GID */
      }
      ierr = VecAssemblyBegin(locMaxEdge);CHKERRQ(ierr);
      ierr = VecAssemblyEnd(locMaxEdge);CHKERRQ(ierr);

      if (size>1 && sub_it != n_sub_its-1){
        /* compute 'cpcol_max_ew' */
        ierr = VecScatterBegin(mpimat->Mvctx,locMaxEdge,ghostMaxEdge,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
        ierr =   VecScatterEnd(mpimat->Mvctx,locMaxEdge,ghostMaxEdge,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
        ierr = VecGetArray(ghostMaxEdge, &cpcol_max_ew);CHKERRQ(ierr);
        ierr = VecGetArray(locMaxEdge, &lid_max_ew);CHKERRQ(ierr);

        /* compute 'cpcol_max_pe' */
        for (kk=0,gid=my0;kk<nloc;kk++,gid++){
          PetscInt    lid = kk;
          PetscReal   ew,v1_max_e,v0_max_e=PetscRealPart(lid_max_ew[lid]);
          PetscScalar vval;
          PetscMPIInt max_pe=rank,pe;
          if (lid_matched[lid]) vval = (PetscScalar)rank;
          else if ((ix=lid_cprowID[lid]) != -1) { /* if I have any ghost neighbors */
            ii = matB->compressedrow.i; n = ii[ix+1] - ii[ix];
            ap = matB->a + ii[ix];
            idx = matB->j + ii[ix];
            for (jj=0 ; jj<n ; jj++) {
              PetscInt lidj = idx[jj];
              if (cpcol_matched[lidj]) continue;
              ew = PetscRealPart(ap[jj]); v1_max_e = PetscRealPart(cpcol_max_ew[lidj]);
              /* get max pe that has a max_e == to this edge w */
              if ((pe=cpcol_pe[idx[jj]]) > max_pe && ew > v1_max_e - 1.e-12 && ew > v0_max_e - 1.e-12) max_pe = pe;
              assert(ew < v0_max_e + 1.e-12 && ew < v1_max_e + 1.e-12);
            }
            vval = (PetscScalar)max_pe;
          }
          ierr = VecSetValues(locMaxPE, 1, &gid, &vval, INSERT_VALUES);CHKERRQ(ierr);
        }
        ierr = VecAssemblyBegin(locMaxPE);CHKERRQ(ierr);
        ierr = VecAssemblyEnd(locMaxPE);CHKERRQ(ierr);

        ierr = VecScatterBegin(mpimat->Mvctx,locMaxPE,ghostMaxPE,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
        ierr =   VecScatterEnd(mpimat->Mvctx,locMaxPE,ghostMaxPE,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
        ierr = VecGetArray(ghostMaxPE, &cpcol_max_pe);CHKERRQ(ierr);
        ierr = VecRestoreArray(locMaxEdge, &lid_max_ew);CHKERRQ(ierr);
      } /* deal with deleted ghost */
      if (verbose>2) PetscPrintf(wcomm,"\t[%d]%s %d.%d: %d active edges.\n",
                                rank,__FUNCT__,iter,sub_it,nactive_edges);
      if (!nactive_edges) break;
    } /* sub_it loop */

    /* clean up iteration */
    ierr = PetscFree(Edges);CHKERRQ(ierr);
    if (mpimat){
      ierr = VecRestoreArray(ghostMaxEdge, &cpcol_max_ew);CHKERRQ(ierr);
      ierr = VecDestroy(&ghostMaxEdge);CHKERRQ(ierr);
      ierr = VecRestoreArray(ghostMaxPE, &cpcol_max_pe);CHKERRQ(ierr);
      ierr = VecDestroy(&ghostMaxPE);CHKERRQ(ierr);
      ierr = PetscFree(cpcol_pe);CHKERRQ(ierr);
      ierr = PetscFree(cpcol_matched);CHKERRQ(ierr);
    }

    ierr = VecDestroy(&locMaxEdge);CHKERRQ(ierr);
    ierr = VecDestroy(&locMaxPE);CHKERRQ(ierr);

    if (mpimat){
      ierr = VecRestoreArray(mpimat->lvec, &cpcol_gid);CHKERRQ(ierr);
    }

    /* create next G if needed */
    if (iter == n_iter) { /* hard wired test - need to look at full surrounded nodes or something */
      ierr = MatDestroy(&P);CHKERRQ(ierr);
      ierr = MatDestroy(&cMat);CHKERRQ(ierr);
      break;
    } else {
      Vec diag;
      /* add identity for unmatched vertices so they stay alive */
      for (kk=0,gid=my0;kk<nloc;kk++,gid++){
        if (!lid_matched[kk]) {
          gid = kk+my0;
          ierr = MatGetRow(cMat,gid,&n,0,0);CHKERRQ(ierr);
          if (n>1){
            ierr = MatSetValues(P,1,&gid,1,&gid,&one,INSERT_VALUES);CHKERRQ(ierr);
          }
          ierr = MatRestoreRow(cMat,gid,&n,0,0);CHKERRQ(ierr);
        }
      }
      ierr = MatAssemblyBegin(P,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
      ierr = MatAssemblyEnd(P,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

      /* project to make new graph with colapsed edges */
      ierr = MatPtAP(cMat,P,MAT_INITIAL_MATRIX,1.0,&tMat);CHKERRQ(ierr);
      ierr = MatDestroy(&P);CHKERRQ(ierr);
      ierr = MatDestroy(&cMat);CHKERRQ(ierr);
      cMat = tMat;
      ierr = MatGetVecs(cMat, &diag, 0);CHKERRQ(ierr);
      ierr = MatGetDiagonal(cMat, diag);CHKERRQ(ierr); /* effectively PCJACOBI */
      ierr = VecReciprocal(diag);CHKERRQ(ierr);
      ierr = VecSqrtAbs(diag);CHKERRQ(ierr);
      ierr = MatDiagonalScale(cMat, diag, diag);CHKERRQ(ierr);
      ierr = VecDestroy(&diag);CHKERRQ(ierr);
    }
  } /* coarsen iterator */

  /* make fake matrix */
  if (size>1){
    Mat        mat;
    PetscCDPos pos;
    PetscInt   gid, NN, MM, jj = 0, mxsz = 0;

    for (kk=0;kk<nloc;kk++){
      ierr = PetscCDSizeAt(agg_llists, kk, &jj);CHKERRQ(ierr);
      if (jj > mxsz)  mxsz = jj;
    }
    ierr = MatGetSize(a_Gmat, &MM, &NN);CHKERRQ(ierr);
    if (mxsz > MM-nloc) mxsz = MM-nloc;

    ierr = MatCreateAIJ(wcomm, nloc, nloc,PETSC_DETERMINE, PETSC_DETERMINE,0, 0, mxsz, 0, &mat);CHKERRQ(ierr);

    /* */
    for (kk=0,gid=my0;kk<nloc;kk++,gid++){
      /* for (pos=PetscCDGetHeadPos(agg_llists,kk) ; pos ; pos=PetscCDGetNextPos(agg_llists,kk,pos)){ */
      ierr = PetscCDGetHeadPos(agg_llists,kk,&pos);CHKERRQ(ierr);
      while(pos){
        PetscInt gid1;
        ierr = PetscLLNGetID(pos, &gid1);CHKERRQ(ierr);
        ierr = PetscCDGetNextPos(agg_llists,kk,&pos);CHKERRQ(ierr);

        if (gid1 < my0 || gid1 >= my0+nloc) {
          ierr = MatSetValues(mat,1,&gid,1,&gid1,&one,ADD_VALUES);CHKERRQ(ierr);
        }
      }
    }
    ierr = MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

    ierr = PetscCDSetMat(agg_llists, mat);CHKERRQ(ierr);
  }

  ierr = PetscFree(lid_cprowID);CHKERRQ(ierr);
  ierr = PetscFree(lid_gid);CHKERRQ(ierr);
  ierr = PetscFree(lid_matched);CHKERRQ(ierr);
  ierr = PetscCDDestroy(deleted_list);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}