AmesosBTF_CrsMatrix::NewTypeRef
AmesosBTF_CrsMatrix::
operator()( OriginalTypeRef orig )
{
  origObj_ = &orig;
  const Epetra_BlockMap & OldRowMap = orig.RowMap();
  const Epetra_BlockMap & OldColMap = orig.ColMap();
  
  // Check if the matrix is on one processor.
  int myMatProc = -1, matProc = -1;
  int myPID = orig.Comm().MyPID();
  for (int proc=0; proc<orig.Comm().NumProc(); proc++) 
  {
    if (orig.NumGlobalNonzeros() == orig.NumMyNonzeros())
      myMatProc = myPID;
  }
  orig.Comm().MaxAll( &myMatProc, &matProc, 1 );
  
  if( orig.RowMap().DistributedGlobal() && matProc == -1)
    { cout << "FAIL for Global!\n"; abort(); }
  if( orig.IndicesAreGlobal() && matProc == -1)
    { cout << "FAIL for Global Indices!\n"; abort(); }
 
  int nGlobal = orig.NumGlobalRows(); 
  int n = orig.NumMyRows();
  int nnz = orig.NumMyNonzeros();
  
  if( debug_ )
  {
    cout << "Orig Matrix:\n";
    cout << orig << endl;
  }

  // Create std CRS format (without elements above the threshold)
  vector<int> ia(n+1,0);
  int maxEntries = orig.MaxNumEntries();
  vector<int> ja(nnz), ja_tmp(nnz);
  vector<double> jva_tmp(maxEntries);
  int cnt;

  Epetra_CrsGraph strippedGraph( Copy, OldRowMap, OldColMap, 0 );

  for( int i = 0; i < n; ++i )
  {
    orig.ExtractMyRowCopy( i, maxEntries, cnt, &jva_tmp[0], &ja_tmp[0] );
    ia[i+1] = ia[i];
    for( int j = 0; j < cnt; ++j )
      if( fabs(jva_tmp[j]) > threshold_ )
        ja[ ia[i+1]++ ] = ja_tmp[j];

    int new_cnt = ia[i+1] - ia[i];
    strippedGraph.InsertMyIndices( i, new_cnt, &ja[ ia[i] ] );
  }
  nnz = ia[n];
  strippedGraph.FillComplete();
  
  if( debug_ )
  {
    cout << "Stripped Graph\n";
    cout << strippedGraph;
  }

  // Compute the BTF permutation only on the processor that has the graph.
  if ( matProc == myPID ) {
    
    if( debug_ )
      {
	cout << "-----------------------------------------\n";
	cout << "CRS Format Graph (stripped) \n";
	cout << "-----------------------------------------\n";
	for( int i = 0; i < n; ++i )
	  {
	    cout << ia[i] << " - " << ia[i+1] << " : ";
	    for( int j = ia[i]; j<ia[i+1]; ++j )
	      cout << " " << ja[j];
	    cout << endl;
	  }
	cout << "-----------------------------------------\n";
      }
  
    // Transpose the graph, not the values
    int j=0, next=0;
    vector<int> ia_tmp(n+1,0);

    // Compute row lengths
    for (int i = 0; i < n; i++)
        for (int k = ia[i]; k < ia[i+1]; k++)
            ++ia_tmp[ ja[k]+1 ];

    // Compute pointers from row lengths
    ia_tmp[0] = 0;
    for (int i = 0; i < n; i++)
        ia_tmp[i+1] += ia_tmp[i];

    // Copy over indices
    for (int i = 0; i < n; i++) {
        for (int k = ia[i]; k < ia[i+1]; k++) {
            j = ja[k];
            next = ia_tmp[j];
            ja_tmp[next] = i;
            ia_tmp[j] = next + 1;
        }
    }

    // Reshift ia_tmp
    for (int i=n-1; i >= 0; i--) ia_tmp[i+1] = ia_tmp[i];
    ia_tmp[0] = 0;

    // Transformation information
    int numMatch = 0;       // number of nonzeros on diagonal after permutation.
    double maxWork =  0.0;  // no limit on how much work to perform in max-trans.
    double workPerf = 0.0;  // how much work was performed in max-trans.
    
    // Create a work vector for the BTF code.
    vector<int> work(5*n);
    
    // Storage for the row and column permutations.
    vector<int> rowperm(n);
    vector<int> colperm(n);
    vector<int> blockptr(n+1);
    
    // NOTE:  The permutations are sent in backwards since the matrix is transposed.
    // On output, rowperm and colperm are the row and column permutations of A, where 
    // i = BTF_UNFLIP(rowperm[k]) if row i of A is the kth row of P*A*Q, and j = colperm[k] 
    // if column j of A is the kth column of P*A*Q.  If rowperm[k] < 0, then the 
    // (k,k)th entry in P*A*Q is structurally zero.
    
    numBlocks_ = amesos_btf_order( n, &ia_tmp[0], &ja_tmp[0], maxWork, &workPerf,
			    &rowperm[0], &colperm[0], &blockptr[0], 
			    &numMatch, &work[0] );
    
    // Reverse ordering of permutation to get upper triangular form, if necessary.
    rowPerm_.resize( n );
    colPerm_.resize( n ); 
    blockptr.resize( numBlocks_+1 );
    blockPtr_.resize( numBlocks_+1 );
    if (!upperTri_) {
      for( int i = 0; i < n; ++i )
	{
	  rowPerm_[i] = BTF_UNFLIP(rowperm[(n-1)-i]);
	  colPerm_[i] = colperm[(n-1)-i];
	}
      for( int i = 0; i < numBlocks_+1; ++i ) 
	{
	  blockPtr_[i] = n - blockptr[numBlocks_-i];
	}
    }
    else {
      colPerm_ = colperm;
      blockPtr_ = blockptr;
      for( int i = 0; i < n; ++i )
	{
	  rowPerm_[i] = BTF_UNFLIP(rowperm[i]);
	}
    }
    
    if( debug_ ) {
      cout << "-----------------------------------------\n";
      cout << "BTF Output (n = " << n << ")\n";
      cout << "-----------------------------------------\n";
      cout << "Num Blocks: " << numBlocks_ << endl;
      cout << "Num NNZ Diags: " << numMatch << endl;
      cout << "RowPerm and ColPerm \n";
      for( int i = 0; i<n; ++i )
	cout << rowPerm_[i] << "\t" << colPerm_[i] << endl;
      cout << "-----------------------------------------\n";
    }  
  }

  // Broadcast the BTF permutation information to all processors.
  rowPerm_.resize( nGlobal );
  colPerm_.resize( nGlobal );

  orig.Comm().Broadcast(&rowPerm_[0], nGlobal, matProc);
  orig.Comm().Broadcast(&colPerm_[0], nGlobal, matProc);
  orig.Comm().Broadcast(&numBlocks_, 1, matProc);

  blockPtr_.resize( numBlocks_+1 );
  orig.Comm().Broadcast(&blockPtr_[0], numBlocks_+1, matProc);
  
  //Generate New Domain and Range Maps
  //for now, assume they start out as identical
  vector<int> myElements( n );
  OldRowMap.MyGlobalElements( &myElements[0] );
  
  vector<int> newDomainElements( n );
  vector<int> newRangeElements( n );
  for( int i = 0; i < n; ++i )
  {
    newRangeElements[ i ] = myElements[ rowPerm_[i] ];
    newDomainElements[ i ] = myElements[ colPerm_[i] ];
  }

  NewRowMap_ = Teuchos::rcp( new Epetra_Map( nGlobal, n, &newRangeElements[0], OldRowMap.IndexBase(), OldRowMap.Comm() ) );
  NewColMap_ = Teuchos::rcp( new Epetra_Map( nGlobal, n, &newDomainElements[0], OldColMap.IndexBase(), OldColMap.Comm() ) );

  if( debug_ )
  {
    cout << "New Row Map\n";
    cout << *NewRowMap_ << endl;
    cout << "New Col Map\n";
    cout << *NewColMap_ << endl;
  }

  //Generate New Graph
  NewGraph_ = Teuchos::rcp( new Epetra_CrsGraph( Copy, *NewRowMap_, *NewColMap_, 0 ) );
  Importer_ = Teuchos::rcp( new Epetra_Import( *NewRowMap_, OldRowMap ) );
  NewGraph_->Import( strippedGraph, *Importer_, Insert );
  NewGraph_->FillComplete();

  if( debug_ )
  {
    cout << "NewGraph\n";
    cout << *NewGraph_;
  }

  NewMatrix_ = Teuchos::rcp( new Epetra_CrsMatrix( Copy, *NewGraph_ ) );
  NewMatrix_->Import( orig, *Importer_, Insert );
  NewMatrix_->FillComplete();

  if( debug_ )
  {
    cout << "New CrsMatrix\n";
    cout << *NewMatrix_ << endl;
  }

  newObj_ = &*NewMatrix_;

  return *NewMatrix_;
}
AmesosBTFGlobal_LinearProblem::NewTypeRef
AmesosBTFGlobal_LinearProblem::
operator()( OriginalTypeRef orig )
{
  origObj_ = &orig;

  // Extract the matrix and vectors from the linear problem
  OldRHS_ = Teuchos::rcp( orig.GetRHS(), false );
  OldLHS_ = Teuchos::rcp( orig.GetLHS(), false );
  OldMatrix_ = Teuchos::rcp( dynamic_cast<Epetra_CrsMatrix *>( orig.GetMatrix() ), false );
	
  int nGlobal = OldMatrix_->NumGlobalRows(); 
  int n = OldMatrix_->NumMyRows();

  // Check if the matrix is on one processor.
  int myMatProc = -1, matProc = -1;
  int myPID = OldMatrix_->Comm().MyPID();
  int numProcs = OldMatrix_->Comm().NumProc();

  const Epetra_BlockMap& oldRowMap = OldMatrix_->RowMap();

  // Get some information about the parallel distribution.
  int maxMyRows = 0;
  std::vector<int> numGlobalElem( numProcs );
  OldMatrix_->Comm().GatherAll(&n, &numGlobalElem[0], 1);
  OldMatrix_->Comm().MaxAll(&n, &maxMyRows, 1);

  for (int proc=0; proc<numProcs; proc++) 
  {
    if (OldMatrix_->NumGlobalNonzeros() == OldMatrix_->NumMyNonzeros())
      myMatProc = myPID;
  }

  OldMatrix_->Comm().MaxAll( &myMatProc, &matProc, 1 );

  Teuchos::RCP<Epetra_CrsMatrix> serialMatrix;
  Teuchos::RCP<Epetra_Map> serialMap;	
  if( oldRowMap.DistributedGlobal() && matProc == -1) 
  {
    // The matrix is distributed and needs to be moved to processor zero.
    // Set the zero processor as the master.
    matProc = 0;
    serialMap = Teuchos::rcp( new Epetra_Map( Epetra_Util::Create_Root_Map( OldMatrix_->RowMap(), matProc ) ) );
    
    Epetra_Import serialImporter( *serialMap, OldMatrix_->RowMap() );
    serialMatrix = Teuchos::rcp( new Epetra_CrsMatrix( Copy, *serialMap, 0 ) );
    serialMatrix->Import( *OldMatrix_, serialImporter, Insert );
    serialMatrix->FillComplete();
  }
  else {
    // The old matrix has already been moved to one processor (matProc).
    serialMatrix = OldMatrix_;
  }

  if( debug_ )
  {
    cout << "Original (serial) Matrix:\n";
    cout << *serialMatrix << endl;
  }

  // Obtain the current row and column orderings
  std::vector<int> origGlobalRows(nGlobal), origGlobalCols(nGlobal);
  serialMatrix->RowMap().MyGlobalElements( &origGlobalRows[0] );
  serialMatrix->ColMap().MyGlobalElements( &origGlobalCols[0] );
  
  // Perform reindexing on the full serial matrix (needed for BTF).
  Epetra_Map reIdxMap( serialMatrix->RowMap().NumGlobalElements(), serialMatrix->RowMap().NumMyElements(), 0, serialMatrix->Comm() );
  Teuchos::RCP<EpetraExt::ViewTransform<Epetra_CrsMatrix> > reIdxTrans =
    Teuchos::rcp( new EpetraExt::CrsMatrix_Reindex( reIdxMap ) );
  Epetra_CrsMatrix newSerialMatrix = (*reIdxTrans)( *serialMatrix );
  reIdxTrans->fwd();
  
  // Compute and apply BTF to the serial CrsMatrix and has been filtered by the threshold
  EpetraExt::AmesosBTF_CrsMatrix BTFTrans( threshold_, upperTri_, verbose_, debug_ );
  Epetra_CrsMatrix newSerialMatrixBTF = BTFTrans( newSerialMatrix );
  
  rowPerm_ = BTFTrans.RowPerm();
  colPerm_ = BTFTrans.ColPerm();
  blockPtr_ = BTFTrans.BlockPtr();
  numBlocks_ = BTFTrans.NumBlocks();
 
  if (myPID == matProc && verbose_) {
    bool isSym = true;
    for (int i=0; i<nGlobal; ++i) {
      if (rowPerm_[i] != colPerm_[i]) {
        isSym = false;
        break;
      }
    }
    std::cout << "The BTF permutation symmetry (0=false,1=true) is : " << isSym << std::endl;
  }
  
  // Compute the permutation w.r.t. the original row and column GIDs.
  std::vector<int> origGlobalRowsPerm(nGlobal), origGlobalColsPerm(nGlobal);
  if (myPID == matProc) {
    for (int i=0; i<nGlobal; ++i) {
      origGlobalRowsPerm[i] = origGlobalRows[ rowPerm_[i] ];
      origGlobalColsPerm[i] = origGlobalCols[ colPerm_[i] ];
    }
  }
  OldMatrix_->Comm().Broadcast( &origGlobalRowsPerm[0], nGlobal, matProc );
  OldMatrix_->Comm().Broadcast( &origGlobalColsPerm[0], nGlobal, matProc );

  // Generate the full serial matrix that imports according to the previously computed BTF.
  Epetra_CrsMatrix newSerialMatrixT( Copy, newSerialMatrixBTF.RowMap(), 0 );
  newSerialMatrixT.Import( newSerialMatrix, *(BTFTrans.Importer()), Insert );
  newSerialMatrixT.FillComplete();
  
  if( debug_ )
  {
    cout << "Original (serial) Matrix permuted via BTF:\n";
    cout << newSerialMatrixT << endl;
  }

  // Perform reindexing on the full serial matrix (needed for balancing).
  Epetra_Map reIdxMap2( newSerialMatrixT.RowMap().NumGlobalElements(), newSerialMatrixT.RowMap().NumMyElements(), 0, newSerialMatrixT.Comm() );
  Teuchos::RCP<EpetraExt::ViewTransform<Epetra_CrsMatrix> > reIdxTrans2 =
    Teuchos::rcp( new EpetraExt::CrsMatrix_Reindex( reIdxMap2 ) );
  Epetra_CrsMatrix tNewSerialMatrixT = (*reIdxTrans2)( newSerialMatrixT );
  reIdxTrans2->fwd();

  Teuchos::RCP<Epetra_Map> balancedMap;
  
  if (balance_ == "linear") {
    
    // Distribute block somewhat evenly across processors
    std::vector<int> rowDist(numProcs+1,0);
    int balRows = nGlobal / numProcs + 1;
    int numRows = balRows, currProc = 1;
    for ( int i=0; i<numBlocks_ || currProc < numProcs; ++i ) {
      if (blockPtr_[i] > numRows) {
	rowDist[currProc++] = blockPtr_[i-1];
	numRows = blockPtr_[i-1] + balRows;
      }      
    }
    rowDist[numProcs] = nGlobal;
   
    // Create new Map based on this linear distribution.
    int numMyBalancedRows = rowDist[myPID+1]-rowDist[myPID];

    NewRowMap_ = Teuchos::rcp( new Epetra_Map( nGlobal, numMyBalancedRows, &origGlobalRowsPerm[ rowDist[myPID] ], 0, OldMatrix_->Comm() ) );
    // Right now we do not explicitly build the column map and assume the BTF permutation is symmetric!
    //NewColMap_ = Teuchos::rcp( new Epetra_Map( nGlobal, nGlobal, &colPerm_[0], 0, OldMatrix_->Comm() ) );
    
    if ( verbose_ ) 
      std::cout << "Processor " << myPID << " has " << numMyBalancedRows << " rows." << std::endl;    
    //balancedMap = Teuchos::rcp( new Epetra_Map( nGlobal, numMyBalancedRows, 0, serialMatrix->Comm() ) );
  }
  else if (balance_ == "isorropia") {
	
    // Compute block adjacency graph for partitioning.
    std::vector<double> weight;
    Teuchos::RCP<Epetra_CrsGraph> blkGraph;
    EpetraExt::BlockAdjacencyGraph adjGraph;
    blkGraph = adjGraph.compute( const_cast<Epetra_CrsGraph&>(tNewSerialMatrixT.Graph()), 
							numBlocks_, blockPtr_, weight, verbose_);
    Epetra_Vector rowWeights( View, blkGraph->Map(), &weight[0] );
    
    // Call Isorropia to rebalance this graph.
    Teuchos::RCP<Epetra_CrsGraph> balancedGraph =
      Isorropia::Epetra::create_balanced_copy( *blkGraph, rowWeights );
    
    int myNumBlkRows = balancedGraph->NumMyRows();    
    
    //std::vector<int> myGlobalElements(nGlobal);
    std::vector<int> newRangeElements(nGlobal), newDomainElements(nGlobal);
    int grid = 0, myElements = 0;
    for (int i=0; i<myNumBlkRows; ++i) {
      grid = balancedGraph->GRID( i );
      for (int j=blockPtr_[grid]; j<blockPtr_[grid+1]; ++j) {
	newRangeElements[myElements++] = origGlobalRowsPerm[j];
	//myGlobalElements[myElements++] = j;
      }
    }

    NewRowMap_ = Teuchos::rcp( new Epetra_Map( nGlobal, myElements, &newRangeElements[0], 0, OldMatrix_->Comm() ) );
    // Right now we do not explicitly build the column map and assume the BTF permutation is symmetric!
    //NewColMap_ = Teuchos::rcp( new Epetra_Map( nGlobal, nGlobal, &colPerm_[0], 0, OldMatrix_->Comm() ) );
    //balancedMap = Teuchos::rcp( new Epetra_Map( nGlobal, myElements, &myGlobalElements[0], 0, serialMatrix->Comm() ) );

    if ( verbose_ ) 
      std::cout << "Processor " << myPID << " has " << myElements << " rows." << std::endl;
  }
  
  // Use New Domain and Range Maps to Generate Importer
  //for now, assume they start out as identical
  Epetra_Map OldRowMap = OldMatrix_->RowMap();
  Epetra_Map OldColMap = OldMatrix_->ColMap();
  
  if( debug_ )
  {
    cout << "New Row Map\n";
    cout << *NewRowMap_ << endl;
    //cout << "New Col Map\n";
    //cout << *NewColMap_ << endl;
  }

  // Generate New Graph
  // NOTE:  Right now we are creating the graph, assuming that the permutation is symmetric!
  // NewGraph_ = Teuchos::rcp( new Epetra_CrsGraph( Copy, *NewRowMap_, *NewColMap_, 0 ) );
  NewGraph_ = Teuchos::rcp( new Epetra_CrsGraph( Copy, *NewRowMap_, 0 ) );
  Importer_ = Teuchos::rcp( new Epetra_Import( *NewRowMap_, OldRowMap ) );
  Importer2_ = Teuchos::rcp( new Epetra_Import( OldRowMap, *NewRowMap_ ) );
  NewGraph_->Import( OldMatrix_->Graph(), *Importer_, Insert );
  NewGraph_->FillComplete();

  if( debug_ )
  {
    cout << "NewGraph\n";
    cout << *NewGraph_;
  }

  // Create new linear problem and import information from old linear problem
  NewMatrix_ = Teuchos::rcp( new Epetra_CrsMatrix( Copy, *NewGraph_ ) );
  NewMatrix_->Import( *OldMatrix_, *Importer_, Insert );
  NewMatrix_->FillComplete();

  NewLHS_ = Teuchos::rcp( new Epetra_MultiVector( *NewRowMap_, OldLHS_->NumVectors() ) );
  NewLHS_->Import( *OldLHS_, *Importer_, Insert );
  
  NewRHS_ = Teuchos::rcp( new Epetra_MultiVector( *NewRowMap_, OldRHS_->NumVectors() ) );
  NewRHS_->Import( *OldRHS_, *Importer_, Insert );

  if( debug_ )
  {
    cout << "New Matrix\n";
    cout << *NewMatrix_ << endl;
  }

  newObj_ = new Epetra_LinearProblem( &*NewMatrix_, &*NewLHS_, &*NewRHS_ );

  return *newObj_;
}