void BlockUtility::TGenerateRowStencil(const Epetra_CrsGraph& LocalBlockGraph,
                                      std::vector<int_type> RowIndices,
                                      std::vector< std::vector<int_type> >& RowStencil)
{
  // Get row indices
  int NumMyRows = LocalBlockGraph.NumMyRows();
  RowIndices.resize(NumMyRows);
  const Epetra_BlockMap& RowMap = LocalBlockGraph.RowMap();
  RowMap.MyGlobalElements(&RowIndices[0]);

  // Get stencil
  RowStencil.resize(NumMyRows);
  if (LocalBlockGraph.IndicesAreGlobal()) {
    for (int i=0; i<NumMyRows; i++) {
      int_type Row = RowIndices[i];
      int NumCols = LocalBlockGraph.NumGlobalIndices(Row);
      RowStencil[i].resize(NumCols);
      LocalBlockGraph.ExtractGlobalRowCopy(Row, NumCols, NumCols,
                                           &RowStencil[i][0]);
      for (int k=0; k<NumCols; k++)
        RowStencil[i][k] -= Row;
    }
  }
  else {
    for (int i=0; i<NumMyRows; i++) {
      int NumCols = LocalBlockGraph.NumMyIndices(i);
	  std::vector<int> RowStencil_local(NumCols);
      RowStencil[i].resize(NumCols);
      LocalBlockGraph.ExtractMyRowCopy(i, NumCols, NumCols,
                                       &RowStencil_local[0]);
      for (int k=0; k<NumCols; k++)
        RowStencil[i][k] = (int_type) ((int) (LocalBlockGraph.GCID64(RowStencil_local[k]) - RowIndices[i]));
    }
  }
}
  Teuchos::RCP<Epetra_CrsGraph> BlockAdjacencyGraph::compute( Epetra_CrsGraph& B, int nbrr, std::vector<int>&r, std::vector<double>& weights, bool verbose)
  {
    // Check if the graph is on one processor.
    int myMatProc = -1, matProc = -1;
    int myPID = B.Comm().MyPID();
    for (int proc=0; proc<B.Comm().NumProc(); proc++)
      {
	if (B.NumGlobalEntries() == B.NumMyEntries())
	  myMatProc = myPID;
      }
    B.Comm().MaxAll( &myMatProc, &matProc, 1 );
    
    if( matProc == -1)
      { cout << "FAIL for Global!  All CrsGraph entries must be on one processor!\n"; abort(); }
    
    int i= 0, j = 0, k, l = 0, p, pm, q = -1, ns;
    int tree_height;
    int error = -1;    /* error detected, possibly a problem with the input */
    int nrr;           /* number of rows in B */
    int nzM = 0;       /* number of edges in graph */
    int m = 0;         /* maximum number of nonzeros in any block row of B */
    int* colstack = 0; /* stack used to process each block row */
    int* bstree = 0;   /* binary search tree */
    std::vector<int> Mi, Mj, Mnum(nbrr+1,0);
    nrr = B.NumMyRows();
    if ( matProc == myPID && verbose )
      std::printf(" Matrix Size = %d      Number of Blocks = %d\n",nrr, nbrr);
    else
      nrr = -1;     /* Prevent processor from doing any computations */
    bstree = csr_bst(nbrr);  /* 0 : nbrr-1 */
    tree_height = ceil31log2(nbrr) + 1;
    error = -1;

    l = 0; j = 0; m = 0;
    for( i = 0; i < nrr; i++ ){
      if( i >= r[l+1] ){
	++l;                 /* new block row */
	m = EPETRA_MAX(m,j) ;   /* nonzeros in block row */
	j = B.NumGlobalIndices(i);
      }else{
	j += B.NumGlobalIndices(i);
      }
    }
    /* one more time for the final block */
     m = EPETRA_MAX(m,j) ;   /* nonzeros in block row */

    colstack = (int*) malloc( EPETRA_MAX(m,1) * sizeof(int) );
    // The compressed graph is actually computed twice,
    // due to concerns about memory limitations.  First, 
    // without memory allocation, just nzM is computed.  
    // Next Mj is allocated. Then, the second time, the
    // arrays are actually populated.
    nzM = 0; q = -1; l = 0;
    int * indices;
    int numEntries;
    for( i = 0; i <= nrr; i++ ){
      if( i >= r[l+1] ){
	if( q > 0 ) std::qsort(colstack,q+1,sizeof(int),compare_ints); /* sort stack */
	if( q >= 0 ) ns = 1; /* l, colstack[0] M */
	for( j=1; j<=q ; j++ ){ /* delete copies */
	  if( colstack[j] > colstack[j-1] ) ++ns;
	}
	nzM += ns; /*M->p[l+1] = M->p[l] + ns;*/
	++l;
	q = -1;
      }
      if( i < nrr ){
	B.ExtractMyRowView( i, numEntries, indices );
	for( k = 0; k < numEntries; k++){
	  j = indices[k];  ns = 0; p = 0;
	  while( (r[bstree[p]] > j)  ||  (j >= r[bstree[p]+1])  ){
	    if( r[bstree[p]] > j){
	      p = 2*p+1;
	    }else{
	      if( r[bstree[p]+1] <= j) p = 2*p+2;
	    }
	    ++ns;
	    if( p > nbrr || ns > tree_height ) {
	      error = j;
	      std::printf("error: p %d  nbrr %d  ns %d %d\n",p,nbrr,ns,j); break;
	    }
	  }
	  colstack[++q] = bstree[p];
	}
	//if( error >-1 ){ std::printf("%d\n",error); break; }
        // p > nbrr is a fatal error that is ignored
      }
    }
    
    if ( matProc == myPID && verbose )
      std::printf("nzM =  %d \n", nzM );
    Mi.resize( nzM );
    Mj.resize( nzM );
    q = -1; l = 0; pm = -1;
    for( i = 0; i <= nrr; i++ ){
      if( i >= r[l+1] ){
	if( q > 0 ) std::qsort(colstack,q+1,sizeof(colstack[0]),compare_ints); /* sort stack */
	if( q >= 0 ){
	  Mi[++pm] = l;
	  Mj[pm] = colstack[0];
	}
	for( j=1; j<=q ; j++ ){ /* delete copies */
	  if( colstack[j] > colstack[j-1] ){ /* l, colstack[j] */
	    Mi[++pm] = l;
	    Mj[pm] = colstack[j];
	  }
	}
	++l;
	Mnum[l] = pm + 1;
	
	/* sparse row format: M->p[l+1] = M->p[l] + ns; */
	q = -1;
      }
      if( i < nrr ){
	B.ExtractMyRowView( i, numEntries, indices );
	for( k = 0; k < numEntries; k++){
	  j = indices[k]; ns = 0; p = 0;
	  while( (r[bstree[p]] > j)  ||  (j >= r[bstree[p]+1])  ){
	    if( r[bstree[p]] > j){
	      p = 2*p+1;
	    }else{
	      if( r[bstree[p]+1] <= j) p = 2*p+2;
	    }
	    ++ns;
	  }
	  colstack[++q] = bstree[p];
	}
      }
    }
    if ( bstree ) free ( bstree );
    if ( colstack ) free( colstack );
    
    // Compute weights as number of rows in each block.
    weights.resize( nbrr );
    for( l=0; l<nbrr; l++) weights[l] = r[l+1] - r[l];
    
    // Compute Epetra_CrsGraph and return
    Teuchos::RCP<Epetra_Map> newMap;
    if ( matProc == myPID )
      newMap = Teuchos::rcp( new Epetra_Map(nbrr, nbrr, 0, B.Comm() ) );
    else
      newMap = Teuchos::rcp( new Epetra_Map( nbrr, 0, 0, B.Comm() ) );
    Teuchos::RCP<Epetra_CrsGraph> newGraph = Teuchos::rcp( new Epetra_CrsGraph( Copy, *newMap, 0 ) );
    for( l=0; l<newGraph->NumMyRows(); l++) {
      newGraph->InsertGlobalIndices( l, Mnum[l+1]-Mnum[l], &Mj[Mnum[l]] );
    }
    newGraph->FillComplete();
    
    return (newGraph);  
  }