//==============================================================================
BlockCrsMatrix::BlockCrsMatrix(
        const Epetra_CrsGraph & BaseGraph,
        const Epetra_CrsGraph & LocalBlockGraph,
        const Epetra_Comm & GlobalComm  ) 
  : Epetra_CrsMatrix( Copy, *(BlockUtility::GenerateBlockGraph( BaseGraph, LocalBlockGraph, GlobalComm )) ),
    BaseGraph_( BaseGraph ),
#ifndef EPETRA_NO_32BIT_GLOBAL_INDICES
    RowStencil_int_( ),
    RowIndices_int_( ),
#endif
#ifndef EPETRA_NO_64BIT_GLOBAL_INDICES
    RowStencil_LL_( ),
    RowIndices_LL_( ),
#endif
    ROffset_(BlockUtility::CalculateOffset64(BaseGraph.RowMap())),
    COffset_(BlockUtility::CalculateOffset64(BaseGraph.ColMap()))
{
#ifndef EPETRA_NO_32BIT_GLOBAL_INDICES
  if(Epetra_CrsMatrix::RowMatrixRowMap().GlobalIndicesInt() && LocalBlockGraph.RowMap().GlobalIndicesInt())
    BlockUtility::GenerateRowStencil(LocalBlockGraph, RowIndices_int_, RowStencil_int_);
  else
#endif
#ifndef EPETRA_NO_64BIT_GLOBAL_INDICES
  if(Epetra_CrsMatrix::RowMatrixRowMap().GlobalIndicesLongLong() && LocalBlockGraph.RowMap().GlobalIndicesLongLong())
    BlockUtility::GenerateRowStencil(LocalBlockGraph, RowIndices_LL_, RowStencil_LL_);
  else
#endif
    throw "EpetraExt::BlockCrsMatrix::BlockCrsMatrix: Error, Global indices unknown.";
}
int compute_graph_metrics(const Epetra_CrsGraph &graph,
            Isorropia::Epetra::CostDescriber &costs,
            double &myGoalWeight,
            double &balance, int &numCuts, double &cutWgt, double &cutn, double &cutl)
{
  const Epetra_BlockMap & rmap = graph.RowMap();
  const Epetra_BlockMap & cmap = graph.ColMap();

  int maxEdges = cmap.NumMyElements();

  std::vector<std::vector<int> > myRows(rmap.NumMyElements());

  if (maxEdges > 0){
    int numEdges = 0;
    int *nborLID  = new int [maxEdges];
    for (int i=0; i<rmap.NumMyElements(); i++){

      graph.ExtractMyRowCopy(i, maxEdges, numEdges, nborLID);
      std::vector<int> cols(numEdges);
      for (int j=0; j<numEdges; j++){
        cols[j] = nborLID[j];
      }
      myRows[i] = cols;
    }
    delete [] nborLID;
  }
 
  return compute_graph_metrics(rmap, cmap, myRows, costs, myGoalWeight,
                               balance, numCuts, cutWgt, cutn, cutl);

}
int compute_hypergraph_metrics(const Epetra_CrsGraph &graph,
            Isorropia::Epetra::CostDescriber &costs,
            double &myGoalWeight,
            double &balance, double &cutn, double &cutl)  // output
{
  return compute_hypergraph_metrics(graph.RowMap(), graph.ColMap(),
                                     graph.NumGlobalCols(), costs,
                                     myGoalWeight,
                                     balance, cutn, cutl);
}
BlockCrsMatrix::BlockCrsMatrix(
        const Epetra_CrsGraph & BaseGraph,
        const vector< vector<long long> > & RowStencil,
        const vector<long long> & RowIndices,
        const Epetra_Comm & GlobalComm  ) 
  : Epetra_CrsMatrix( Copy, *(BlockUtility::GenerateBlockGraph( BaseGraph, RowStencil, RowIndices, GlobalComm )) ),
    BaseGraph_( BaseGraph ),
    RowStencil_LL_( RowStencil ),
    RowIndices_LL_( RowIndices ),
    ROffset_(BlockUtility::CalculateOffset64(BaseGraph.RowMap())),
    COffset_(BlockUtility::CalculateOffset64(BaseGraph.ColMap()))
{
}
BlockCrsMatrix::BlockCrsMatrix(
        const Epetra_CrsGraph & BaseGraph,
        const vector<int> & RowStencil,
        int RowIndex,
        const Epetra_Comm & GlobalComm  ) 
  : Epetra_CrsMatrix( Copy, *(BlockUtility::GenerateBlockGraph( BaseGraph, vector< vector<int> >(1,RowStencil), vector<int>(1,RowIndex), GlobalComm )) ),
    BaseGraph_( BaseGraph ),
    RowStencil_int_( vector< vector<int> >(1,RowStencil) ),
    RowIndices_int_( vector<int>(1,RowIndex) ),
    ROffset_(BlockUtility::CalculateOffset64(BaseGraph.RowMap())),
    COffset_(BlockUtility::CalculateOffset64(BaseGraph.ColMap()))
{
}
Example #6
0
void 
BroydenOperator::removeEntriesFromBroydenUpdate( const Epetra_CrsGraph & graph )
{

  int numRemoveIndices ;
  int * removeIndPtr   ;
  int ierr             ;

  cout << graph << endl;

  for( int row = 0; row < graph.NumMyRows(); ++row) 
  {
    ierr = graph.ExtractMyRowView( row, numRemoveIndices, removeIndPtr );
    if( ierr )
    {
      cout << "ERROR (" << ierr << ") : "
           << "NOX::Epetra::BroydenOperator::removeEntriesFromBroydenUpdate(...)"
           << " - Extract indices error for row --> " << row << endl;
      throw "NOX Broyden Operator Error";
    }

    if( 0 != numRemoveIndices )
    {
      // Create a map for quick queries
      map<int, bool> removeIndTable;
      for( int k = 0; k < numRemoveIndices; ++k )
        removeIndTable[ graph.ColMap().GID(removeIndPtr[k]) ] = true;

      // Get our matrix column indices for the current row
      int numOrigIndices = 0;
      int * indPtr;

      ierr = crsMatrix->Graph().ExtractMyRowView( row, numOrigIndices, indPtr );
      if( ierr )
      {
        cout << "ERROR (" << ierr << ") : "
             << "NOX::Epetra::BroydenOperator::removeEntriesFromBroydenUpdate(...)"
             << " - Extract indices error for row --> " << row << endl;
        throw "NOX Broyden Operator Error";
      }

      // Remove appropriate active entities
      if( retainedEntries.end() == retainedEntries.find(row) )
      {
        list<int> inds;

        for( int k = 0; k < numOrigIndices; ++k )
        {
          if( removeIndTable.end() == removeIndTable.find( crsMatrix->Graph().ColMap().GID(indPtr[k]) ) )
            inds.push_back(k);
        }

        retainedEntries[row] = inds;
      }
      else
      {
        list<int> & inds = retainedEntries[row];

        list<int>::iterator iter     = inds.begin() ,
                            iter_end = inds.end()    ;

        for( ; iter_end != iter; ++iter )
        {
          if( !removeIndTable[ *iter ] )
            inds.remove( *iter );
        }
      }

      entriesRemoved[row] = true;
    }
  }

  return;
}
void show_matrix(const char *txt, const Epetra_CrsGraph &graph, const Epetra_Comm &comm)
{
  int me = comm.MyPID();

  if (comm.NumProc() > 10){
    if (me == 0){
      std::cerr << txt << std::endl;
      std::cerr << "Printed matrix format only works for 10 or fewer processes" << std::endl;
    }
    return;
  }

  const Epetra_BlockMap &rowmap = graph.RowMap();
  const Epetra_BlockMap &colmap = graph.ColMap();

  int myRows = rowmap.NumMyElements();
  int numRows = graph.NumGlobalRows();
  int numCols = graph.NumGlobalCols();
  int base = rowmap.IndexBase();

  if ((numRows > 200) || (numCols > 500)){
    if (me == 0){
      std::cerr << txt << std::endl;
      std::cerr << "show_matrix: problem is too large to display" << std::endl;
    }
    return;
  }

  int *myA = new int [numRows * numCols];
  memset(myA, 0, sizeof(int) * numRows * numCols);

  int *myIndices;

  int *myRowGIDs = rowmap.MyGlobalElements();

  for (int i=0; i< myRows; i++){
    int myRowLID = rowmap.LID(myRowGIDs[i]);

    int numEntries = graph.NumMyIndices(myRowLID);

    if (numEntries > 0){
      int rc = graph.ExtractMyRowView(myRowLID, numEntries, myIndices);
      if (rc){
        std::cerr << txt << std::endl;
        std::cerr << "extract graph error" << std::endl;
        return;
      }

      int *row = myA + (numCols * (myRowGIDs[i] - base));

      for (int j=0; j < numEntries; j++){
        int gid = colmap.GID(myIndices[j]);
        row[gid-base] = me+1;
      }
    }
  }

  printMatrix(txt, myA, NULL, NULL, numRows, numCols, comm);

  delete [] myA;
}
// ============================================================================ 
int ML_Epetra::MatrixFreePreconditioner::
Compute(const Epetra_CrsGraph& Graph, Epetra_MultiVector& NullSpace)
{
  Epetra_Time TotalTime(Comm());

  const int NullSpaceDim = NullSpace.NumVectors();
  // get parameters from the list
  std::string PrecType = List_.get("prec: type", "hybrid");
  std::string SmootherType = List_.get("smoother: type", "Jacobi");
  std::string ColoringType = List_.get("coloring: type", "JONES_PLASSMAN");
  int PolynomialDegree = List_.get("smoother: degree", 3);
  std::string DiagonalColoringType = List_.get("diagonal coloring: type", "JONES_PLASSMAN");
  int MaximumIterations = List_.get("eigen-analysis: max iters", 10);
  std::string EigenType_ = List_.get("eigen-analysis: type", "cg");
  double boost = List_.get("eigen-analysis: boost for lambda max", 1.0);
  int OutputLevel = List_.get("ML output", -47);
  if (OutputLevel == -47) OutputLevel =  List_.get("output", 10);
  omega_ = List_.get("smoother: damping", omega_);
  ML_Set_PrintLevel(OutputLevel);
  bool LowMemory = List_.get("low memory", true);
  double AllocationFactor = List_.get("AP allocation factor", 0.5);

  verbose_ = (MyPID() == 0 && ML_Get_PrintLevel() > 5);

  // ================ //
  // check parameters //
  // ================ //

  if (PrecType == "presmoother only")
    PrecType_ = ML_MFP_PRESMOOTHER_ONLY;
  else if (PrecType == "hybrid")
    PrecType_ = ML_MFP_HYBRID;
  else if (PrecType == "additive")
    PrecType_ = ML_MFP_ADDITIVE;
  else
    ML_CHK_ERR(-3); // not recognized

  if (SmootherType == "none")
    SmootherType_ = ML_MFP_NONE;
  else if (SmootherType == "Jacobi")
    SmootherType_ = ML_MFP_JACOBI;
  else if (SmootherType == "block Jacobi")
    SmootherType_ = ML_MFP_BLOCK_JACOBI;
  else if (SmootherType == "Chebyshev")
    SmootherType_ = ML_MFP_CHEBY;
  else
    ML_CHK_ERR(-4); // not recognized

  if (AllocationFactor <= 0.0)
    ML_CHK_ERR(-1); // should be positive

  // =============================== //
  // basic checkings and some output //
  // =============================== //
  
  int OperatorDomainPoints =  Operator_.OperatorDomainMap().NumGlobalPoints();
  int OperatorRangePoints =  Operator_.OperatorRangeMap().NumGlobalPoints();
  int GraphBlockRows = Graph.NumGlobalBlockRows();
  int GraphNnz = Graph.NumGlobalNonzeros();
  NumPDEEqns_ = OperatorRangePoints / GraphBlockRows;
  NumMyBlockRows_ = Graph.NumMyBlockRows();

  if (OperatorDomainPoints != OperatorRangePoints)
    ML_CHK_ERR(-1); // only square matrices

  if (OperatorRangePoints % NumPDEEqns_ != 0)
    ML_CHK_ERR(-2); // num PDEs seems not constant

  if (verbose_)
  {
    ML_print_line("=",78);
    std::cout << "*** " << std::endl;
    std::cout << "*** ML_Epetra::MatrixFreePreconditioner" << std::endl;
    std::cout << "***" << std::endl;
    std::cout << "Number of rows and columns      = " << OperatorDomainPoints << std::endl;
    std::cout << "Number of rows per processor    = " << OperatorDomainPoints / Comm().NumProc()
         << " (on average)" << std::endl;
    std::cout << "Number of rows in the graph     = " << GraphBlockRows << std::endl;
    std::cout << "Number of nonzeros in the graph = " << GraphNnz << std::endl;
    std::cout << "Processors used in computation  = " << Comm().NumProc() << std::endl;
    std::cout << "Number of PDE equations         = " << NumPDEEqns_ << std::endl;
    std::cout << "Null space dimension            = " << NullSpaceDim << std::endl;
    std::cout << "Preconditioner type             = " << PrecType << std::endl;
    std::cout << "Smoother type                   = " << SmootherType << std::endl;
    std::cout << "Coloring type                   = " << ColoringType << std::endl;
    std::cout << "Allocation factor               = " << AllocationFactor << std::endl;
    std::cout << "Number of V-cycles for C        = " << List_.sublist("ML list").get("cycle applications", 1) << std::endl;
    std::cout << std::endl;
  }

  ResetStartTime();

  // ==================================== //
  // compute the inverse of the diagonal, //
  // control that no elements are zero.   //
  // ==================================== //
  
  for (int i = 0; i < InvPointDiagonal_->MyLength(); ++i)
    if ((*InvPointDiagonal_)[i] != 0.0)
      (*InvPointDiagonal_)[i] = 1.0 / (*InvPointDiagonal_)[i];

  // ========================================================= //
  // Setup the smoother. I need to extract the block diagonal  //
  // only if block Jacobi is used. For Chebyshev, I scale with //
  // the point diagonal only. In this latter case, I need to   //
  // compute lambda_max of the scaled operator.                //
  // ========================================================= //
  
  // probes for the block diagonal of the matrix.
  if (SmootherType_ == ML_MFP_JACOBI ||
      SmootherType_ == ML_MFP_NONE)
  {
    // do-nothing here
  }
  else if (SmootherType_ == ML_MFP_BLOCK_JACOBI)
  {
    if (verbose_);
      std::cout << "Diagonal coloring type         = " << DiagonalColoringType << std::endl;
    ML_CHK_ERR(GetBlockDiagonal(Graph, DiagonalColoringType));

    AddAndResetStartTime("block diagonal construction", true);
  }
  else if (SmootherType_ == ML_MFP_CHEBY)
  {
    double lambda_min = 0.0;
    double lambda_max = 0.0;
    Teuchos::ParameterList IFPACKList;

    if (EigenType_ == "power-method")
    {
      ML_CHK_ERR(Ifpack_Chebyshev::PowerMethod(Operator_, *InvPointDiagonal_,
                                               MaximumIterations, lambda_max));
    }
    else if(EigenType_ == "cg")
    {
      ML_CHK_ERR(Ifpack_Chebyshev::CG(Operator_, *InvPointDiagonal_,
                                      MaximumIterations, lambda_min, 
                                      lambda_max));
    }
    else
      ML_CHK_ERR(-1); // not recognized

    if (verbose_)
    {
      std::cout << "Using Chebyshev smoother of degree " << PolynomialDegree << std::endl;
      std::cout << "Estimating eigenvalues using " <<  EigenType_ << std::endl;
      std::cout << "lambda_min = " << lambda_min << ", ";
      std::cout << "lambda_max = " << lambda_max << std::endl;
    }

    IFPACKList.set("chebyshev: min eigenvalue", lambda_min);
    IFPACKList.set("chebyshev: max eigenvalue", boost * lambda_max);
    // FIXME: this allocates a new std::vector inside
    IFPACKList.set("chebyshev: operator inv diagonal", InvPointDiagonal_.get());
    IFPACKList.set("chebyshev: degree", PolynomialDegree);

    PreSmoother_ = rcp(new Ifpack_Chebyshev((Epetra_Operator*)(&Operator_)));
    if (PreSmoother_.get() == 0) ML_CHK_ERR(-1); // memory error?

    IFPACKList.set("chebyshev: zero starting solution", true);
    ML_CHK_ERR(PreSmoother_->SetParameters(IFPACKList));
    ML_CHK_ERR(PreSmoother_->Initialize());
    ML_CHK_ERR(PreSmoother_->Compute());

    PostSmoother_ = rcp(new Ifpack_Chebyshev((Epetra_Operator*)(&Operator_)));
    if (PostSmoother_.get() == 0) ML_CHK_ERR(-1); // memory error?

    IFPACKList.set("chebyshev: zero starting solution", false);
    ML_CHK_ERR(PostSmoother_->SetParameters(IFPACKList));
    ML_CHK_ERR(PostSmoother_->Initialize());
    ML_CHK_ERR(PostSmoother_->Compute());
  }

  // ========================================================= //
  // building P and R for block graph. This is done by working //
  // on the Graph_ object. Support is provided for local       //
  // aggregation schemes only so that all is basically local.  //
  // Then, build the block graph coarse problem.               //
  // ========================================================= //
  
  // ML wrapper for Graph_
  ML_Operator* Graph_ML = ML_Operator_Create(Comm_ML());
  ML_Operator_WrapEpetraCrsGraph(const_cast<Epetra_CrsGraph*>(&Graph), Graph_ML);

  ML_Aggregate* BlockAggr_ML = 0;
  ML_Operator* BlockPtent_ML = 0, *BlockRtent_ML = 0,* CoarseGraph_ML = 0;

  if (verbose_) std::cout << std::endl;

  ML_CHK_ERR(Coarsen(Graph_ML, &BlockAggr_ML, &BlockPtent_ML, &BlockRtent_ML, 
                     &CoarseGraph_ML));

  if (verbose_) std::cout << std::endl;

  Epetra_CrsMatrix* GraphCoarse;
  ML_CHK_ERR(ML_Operator2EpetraCrsMatrix(CoarseGraph_ML, GraphCoarse));

  // used later to estimate the entries in AP
  ML_Operator* CoarseAP_ML = ML_Operator_Create(Comm_ML());
  ML_2matmult(Graph_ML, BlockPtent_ML, CoarseAP_ML, ML_CSR_MATRIX);

  int AP_MaxNnzRow, itmp = CoarseAP_ML->max_nz_per_row;
  Comm().MaxAll(&itmp, &AP_MaxNnzRow, 1);
  ML_Operator_Destroy(&CoarseAP_ML);

  int NumAggregates = BlockPtent_ML->invec_leng;
  ML_Operator_Destroy(&BlockRtent_ML);
  ML_Operator_Destroy(&CoarseGraph_ML);

  AddAndResetStartTime("construction of block C, R, and P", true);
  if (verbose_) std::cout << std::endl;

  // ================================================== //
  // coloring of block graph:                           //
  // - color of block row `i' is given by `ColorMap[i]' //
  // - number of colors is ColorMap.NumColors().        //
  // ================================================== //
  
  ResetStartTime();

  CrsGraph_MapColoring* MapColoringTransform;
  
  if (ColoringType == "JONES_PLASSMAN")
    MapColoringTransform = new CrsGraph_MapColoring (CrsGraph_MapColoring::JONES_PLASSMAN,
                                                     0, false, 0);
  else if (ColoringType == "PSEUDO_PARALLEL")
    MapColoringTransform = new CrsGraph_MapColoring (CrsGraph_MapColoring::PSEUDO_PARALLEL,
                                                     0, false, 0);
  else if (ColoringType == "GREEDY")
    MapColoringTransform = new CrsGraph_MapColoring (CrsGraph_MapColoring::GREEDY,
                                                     0, false, 0);
  else if (ColoringType == "LUBY")
    MapColoringTransform = new CrsGraph_MapColoring (CrsGraph_MapColoring::LUBY,
                                                     0, false, 0);
  else 
    ML_CHK_ERR(-1);

  Epetra_MapColoring* ColorMap = &(*MapColoringTransform)(const_cast<Epetra_CrsGraph&>(GraphCoarse->Graph()));

  // move the information from ColorMap to std::vector Colors
  const int NumColors = ColorMap->MaxNumColors();
  RefCountPtr<Epetra_IntSerialDenseVector> Colors = rcp(new Epetra_IntSerialDenseVector(GraphCoarse->Graph().NumMyRows()));
  for (int i = 0; i < GraphCoarse->Graph().NumMyRows(); ++i)
    (*Colors)[i] = (*ColorMap)[i];

  delete MapColoringTransform;
  delete ColorMap; ColorMap = 0;
  delete GraphCoarse;

  AddAndResetStartTime("coarse graph coloring", true);
  if (verbose_) std::cout << std::endl;

  // get some other information about the aggregates, to be used
  // in the QR factorization of the null space. NodesOfAggregate
  // contains the local ID of block rows contained in each aggregate.

  // FIXME: make it faster
  std::vector< std::vector<int> > NodesOfAggregate(NumAggregates);

  for (int i = 0; i < Graph.NumMyBlockRows(); ++i)
  {
    int AID = BlockAggr_ML->aggr_info[0][i];
    NodesOfAggregate[AID].push_back(i);
  }

  int MaxAggrSize = 0;
  for (int i = 0; i < NumAggregates; ++i)
  {
    const int& MySize = NodesOfAggregate[i].size();
    if (MySize > MaxAggrSize) MaxAggrSize = MySize;
  }

  // collect aggregate information, and mark all nodes that are
  // connected with each aggregate. These nodes will have a possible
  // nonzero entry after the matrix-matrix product between the Operator_
  // and the tentative prolongator.

  std::vector<vector<int> > aggregates(NumAggregates);
  std::vector<int>::iterator iter;

  for (int i = 0; i < NumAggregates; ++i)
    aggregates[i].reserve(MaxAggrSize);

  for (int i = 0; i < Graph.NumMyBlockRows(); ++i)
  {
    int AID = BlockAggr_ML->aggr_info[0][i];

    int NumEntries;
    int* Indices;

    Graph.ExtractMyRowView(i, NumEntries, Indices);

    for (int k = 0; k < NumEntries; ++k)
    {
      // FIXME: use hash??
      const int& GCID = Graph.ColMap().GID(Indices[k]);

      iter = find(aggregates[AID].begin(), aggregates[AID].end(), GCID);
      if (iter == aggregates[AID].end())
        aggregates[AID].push_back(GCID);
    }
  }
  
  int* BlockNodeList = Graph.ColMap().MyGlobalElements();

  // finally get rid of the ML_Aggregate structure.
  ML_Aggregate_Destroy(&BlockAggr_ML);

  const Epetra_Map& FineMap = Operator_.OperatorDomainMap();
  Epetra_Map CoarseMap(-1, NumAggregates * NullSpaceDim, 0, Comm());
  RefCountPtr<Epetra_Map> BlockNodeListMap = 
    rcp(new Epetra_Map(-1, Graph.ColMap().NumMyElements(),
                       BlockNodeList, 0, Comm()));

  std::vector<int> NodeList(Graph.ColMap().NumMyElements() * NumPDEEqns_);
  for (int i = 0; i < Graph.ColMap().NumMyElements(); ++i)
    for (int m = 0; m < NumPDEEqns_; ++m)
      NodeList[i * NumPDEEqns_ + m] = BlockNodeList[i] * NumPDEEqns_ + m;
  RefCountPtr<Epetra_Map> NodeListMap = 
    rcp(new Epetra_Map(-1, NodeList.size(), &NodeList[0], 0, Comm()));

  AddAndResetStartTime("data structures", true);

  // ====================== //
  // process the null space //
  // ====================== //

  // CHECKME
  Epetra_MultiVector NewNullSpace(CoarseMap, NullSpaceDim);
  NewNullSpace.PutScalar(0.0);

  if (NullSpaceDim == 1)
  {
    double* ns_ptr = NullSpace.Values();

    for (int AID = 0; AID < NumAggregates; ++AID)
    {
      double dtemp = 0.0;
      for (int j = 0; j < (int) (NodesOfAggregate[AID].size()); j++)
        for (int m = 0; m < NumPDEEqns_; ++m)
        {
          const int& pos = NodesOfAggregate[AID][j] * NumPDEEqns_ + m;
          dtemp += (ns_ptr[pos] * ns_ptr[pos]);
        }
      dtemp = std::sqrt(dtemp);

      NewNullSpace[0][AID] = dtemp;

      dtemp = 1.0 / dtemp;

      for (int j = 0; j < (int) (NodesOfAggregate[AID].size()); j++)
        for (int m = 0; m < NumPDEEqns_; ++m)
          ns_ptr[NodesOfAggregate[AID][j] * NumPDEEqns_ + m] *= dtemp;
    }
  }
  else
  {
    // FIXME
    std::vector<double> qr_ptr(MaxAggrSize * NumPDEEqns_ * MaxAggrSize * NumPDEEqns_);
    std::vector<double> tmp_ptr(MaxAggrSize * NumPDEEqns_ * NullSpaceDim);

    std::vector<double> work(NullSpaceDim);
    int info;

    for (int AID = 0; AID < NumAggregates; ++AID)
    {
      int MySize = NodesOfAggregate[AID].size();
      int MyFullSize = NodesOfAggregate[AID].size() * NumPDEEqns_;
      int lwork = NullSpaceDim;

      for (int k = 0; k < NullSpaceDim; ++k)
        for (int j = 0; j < MySize; ++j)
          for (int m = 0; m < NumPDEEqns_; ++m)
            qr_ptr[k * MyFullSize + j * NumPDEEqns_ + m] = 
              NullSpace[k][NodesOfAggregate[AID][j] * NumPDEEqns_ + m];

      DGEQRF_F77(&MyFullSize, (int*)&NullSpaceDim, &qr_ptr[0], 
                 &MyFullSize, &tmp_ptr[0], &work[0], &lwork, &info);

      ML_CHK_ERR(info);

      if (work[0] > lwork) work.resize((int) work[0]);

      // the upper triangle of qr_tmp is now R, so copy that into the 
      //  new nullspace

      for (int j = 0; j < NullSpaceDim; j++)
        for (int k = j; k < NullSpaceDim; k++)
          NewNullSpace[k][AID * NullSpaceDim + j] = qr_ptr[j + MyFullSize * k];
		 
      // to get this block of P, need to run qr_tmp through another LAPACK 
      // function:

      DORGQR_F77(&MyFullSize, (int*)&NullSpaceDim, (int*)&NullSpaceDim, 
                 &qr_ptr[0], &MyFullSize, &tmp_ptr[0], &work[0], &lwork, &info);
      ML_CHK_ERR(info); // dgeqtr returned a non-zero

      if (work[0] > lwork) work.resize((int) work[0]);

      // insert the Q block into the null space

      for (int k = 0; k < NullSpaceDim; ++k)
        for (int j = 0; j < MySize; ++j)
          for (int m = 0; m < NumPDEEqns_; ++m)
          {
            int LRID = NodesOfAggregate[AID][j] * NumPDEEqns_ + m;
            double& val = qr_ptr[k * MyFullSize + j * NumPDEEqns_ + m];
            NullSpace[k][LRID] = val;
          }
    }
  }

  AddAndResetStartTime("null space setup", true);

  if (verbose_)
    std::cout << "Number of colors on processor " << Comm().MyPID() << " = "
        << NumColors << std::endl;
  if (verbose_)
    std::cout << "Maximum number of colors = " << NumColors << std::endl;

  RefCountPtr<Epetra_FECrsMatrix> AP;
  
  // try to get a good estimate of the nonzeros per row.
  // This is a compromize between efficiency -- that is, reduce
  // the memory allocation processes, and memory usage -- that, is
  // overestimating can actually kill the code. Basically, this is
  // all junk due to our dear friend, the Cray XT3.
  
  AP = rcp(new Epetra_FECrsMatrix(Copy, FineMap, (int)
                                  (AllocationFactor * AP_MaxNnzRow * NullSpaceDim)));
  if (AP.get() == 0) throw(-1);

  if (!LowMemory)
  {
    // ================================================= //
    // allocate one big chunk of memory, and use View    //             
    // to create Epetra_MultiVectors. Note that          //
    // NumColors * NullSpace can indeed be a quite large //
    // value. To reduce the memory consumption, both     //
    // ColoredAP and ExtColoredAP use the same memory    //
    // array.                                            //
    // ================================================= //
    
    Epetra_MultiVector* ColoredP;
    std::vector<double> ColoredAP_ptr;

    try
    {
      ColoredP = new Epetra_MultiVector(FineMap, NumColors * NullSpaceDim);
      ColoredAP_ptr.resize(NumColors * NullSpaceDim * NodeListMap->NumMyPoints());
    }
    catch (std::exception& rhs)
    {
      catch_message("the allocation of ColoredP", rhs.what(), __FILE__, __LINE__);
      ML_CHK_ERR(-1);
    }
    catch (...)
    {
      catch_message("the allocation of ColoredP", "", __FILE__, __LINE__);
      ML_CHK_ERR(-1);
    }

    int ColoredAP_LDA = NodeListMap->NumMyPoints();

    ColoredP->PutScalar(0.0);

    for (int i = 0; i < BlockPtent_ML->outvec_leng; ++i)
    {
      int allocated = 1;
      int NumEntries;
      int Indices;
      double Values;
      int ierr = ML_Operator_Getrow(BlockPtent_ML, 1 ,&i, allocated,
                                    &Indices,&Values,&NumEntries);
      if (ierr < 0)
        ML_CHK_ERR(-1);

      assert (NumEntries == 1); // this is the block P
      const int& Color = (*Colors)[Indices] - 1;
      for (int k = 0; k < NumPDEEqns_; ++k)
        for (int j = 0; j < NullSpaceDim; ++j)
          (*ColoredP)[(Color * NullSpaceDim + j)][i * NumPDEEqns_ + k] = 
            NullSpace[j][i * NumPDEEqns_ + k];
    }

    ML_Operator_Destroy(&BlockPtent_ML);

    Epetra_MultiVector ColoredAP(View, Operator_.OperatorRangeMap(), 
                                 &ColoredAP_ptr[0], ColoredAP_LDA, 
                                 NumColors * NullSpaceDim);
    // move ColoredAP into ColoredP. This should not be required.
    // but I prefer to skip strange games with View pointers
    Operator_.Apply(*ColoredP, ColoredAP);
    *ColoredP = ColoredAP;

    // FIXME: only if NumProc > 1
    Epetra_MultiVector ExtColoredAP(View, *NodeListMap, 
                                 &ColoredAP_ptr[0], ColoredAP_LDA, 
                                 NumColors * NullSpaceDim);

    try 
    {
      Epetra_Import Importer(*NodeListMap, Operator_.OperatorRangeMap());
      ExtColoredAP.Import(*ColoredP, Importer, Insert);
    }
    catch (std::exception& rhs)
    {
      catch_message("importing of ExtColoredAP", rhs.what(), __FILE__, __LINE__);
      ML_CHK_ERR(-1);
    }
    catch (...)
    {
      catch_message("importing of ExtColoredAP", "", __FILE__, __LINE__);
      ML_CHK_ERR(-1);
    }

    delete ColoredP;

    AddAndResetStartTime("computation of AP", true); 

    // populate the actual AP operator, skip some controls to make it faster

    for (int i = 0; i < NumAggregates; ++i)
    {
      for (int j = 0; j < (int) (aggregates[i].size()); ++j)
      {
        int GRID = aggregates[i][j];
        int LRID = BlockNodeListMap->LID(GRID); // this is the block ID
        //assert (LRID != -1);
        int GCID = CoarseMap.GID(i * NullSpaceDim);
        //assert (GCID != -1); 
        int color = (*Colors)[i] - 1;
        for (int k = 0; k < NumPDEEqns_; ++k)
          for (int j = 0; j < NullSpaceDim; ++j)
          {
            double val = ExtColoredAP[color * NullSpaceDim + j][LRID * NumPDEEqns_ + k];
            if (val != 0.0)
            {
              int GRID2 = GRID * NumPDEEqns_ + k;
              int GCID2 = GCID + j;
              AP->InsertGlobalValues(1, &GRID2, 1, &GCID2, &val);
              //if (ierr < 0) ML_CHK_ERR(ierr);
            }
          }
      }
    }
  }
  else
  {
    // =============================================================== //
    // apply the operator one color at-a-time. This requires NumColors //
    // cycles over BlockPtent. However, the memory requirements are    //
    // drastically reduced. As for low-memory == false, both ColoredAP //
    // and ExtColoredAP point to the same memory location.             //
    // =============================================================== //
    
    if (verbose_)
      std::cout << "Using low-memory computation for AP" << std::endl;

    Epetra_MultiVector ColoredP(FineMap, NullSpaceDim);
    std::vector<double> ColoredAP_ptr;
    try
    {
      ColoredAP_ptr.resize(NullSpaceDim * NodeListMap->NumMyPoints());
    }
    catch (std::exception& rhs)
    {
      catch_message("resizing of ColoredAP_pt", rhs.what(), __FILE__, __LINE__);
      ML_CHK_ERR(-1);
    }
    catch (...)
    {
      catch_message("resizing of ColoredAP_pt", "", __FILE__, __LINE__);
      ML_CHK_ERR(-1);
    }

    Epetra_MultiVector ColoredAP(View, Operator_.OperatorRangeMap(), 
                                 &ColoredAP_ptr[0], NodeListMap->NumMyPoints(), 
                                 NullSpaceDim);
    Epetra_MultiVector ExtColoredAP(View, *NodeListMap, 
                                 &ColoredAP_ptr[0], NodeListMap->NumMyPoints(), 
                                 NullSpaceDim);
    Epetra_Import Importer(*NodeListMap, Operator_.OperatorRangeMap());

    for (int ic = 0; ic < NumColors; ++ic)
    {
      if (ML_Get_PrintLevel() > 8 && Comm().MyPID() == 0)
      {
        if (ic % 20 == 0)
          std::cout << "Processing color " << flush;

        std::cout << ic << " " << flush;
        if (ic % 20 == 19 || ic == NumColors - 1)
          std::cout << std::endl;
        if (ic == NumColors - 1) std::cout << std::endl;
      }

      ColoredP.PutScalar(0.0);

      for (int i = 0; i < BlockPtent_ML->outvec_leng; ++i)
      {
        int allocated = 1;
        int NumEntries;
        int Indices;
        double Values;
        int ierr = ML_Operator_Getrow(BlockPtent_ML, 1 ,&i, allocated,
                                      &Indices,&Values,&NumEntries);
        if (ierr < 0 ||  // something strange in getrow
            NumEntries != 1) // this is the block P
          ML_CHK_ERR(-1);

        const int& Color = (*Colors)[Indices] - 1;
        if (Color != ic)
          continue; // skip this color for this cycle

        for (int k = 0; k < NumPDEEqns_; ++k)
          for (int j = 0; j < NullSpaceDim; ++j)
            ColoredP[j][i * NumPDEEqns_ + k] = 
              NullSpace[j][i * NumPDEEqns_ + k];
      }

      Operator_.Apply(ColoredP, ColoredAP);
      ColoredP = ColoredAP; // just to be safe

      ExtColoredAP.Import(ColoredP, Importer, Insert);

      // populate the actual AP operator, skip some controls to make it faster

      std::vector<int> InsertCols(NullSpaceDim * NumPDEEqns_);
      std::vector<double> InsertValues(NullSpaceDim * NumPDEEqns_);

      for (int i = 0; i < NumAggregates; ++i)
      {
        for (int j = 0; j < (int) (aggregates[i].size()); ++j)
        {
          int GRID = aggregates[i][j];
          int LRID = BlockNodeListMap->LID(GRID); // this is the block ID
          //assert (LRID != -1);
          int GCID = CoarseMap.GID(i * NullSpaceDim);
          //assert (GCID != -1); 
          int color = (*Colors)[i] - 1;
          if (color != ic) continue;

          for (int k = 0; k < NumPDEEqns_; ++k)
          {
            int count = 0;
            int GRID2 = GRID * NumPDEEqns_ + k;
            for (int j = 0; j < NullSpaceDim; ++j)
            {
              double val = ExtColoredAP[j][LRID * NumPDEEqns_ + k];
              if (val != 0.0)
              {
                InsertCols[count] = GCID + j;
                InsertValues[count] = val;
                ++count;
              }
            }
            AP->InsertGlobalValues(1, &GRID2, count, &InsertCols[0], 
                                   &InsertValues[0]);
          }
        }
      }
    }

    ML_Operator_Destroy(&BlockPtent_ML);
  }

  aggregates.resize(0);
  BlockNodeListMap = Teuchos::null;
  NodeListMap = Teuchos::null;

  Colors = Teuchos::null;

  AP->GlobalAssemble(false);
  AP->FillComplete(CoarseMap, FineMap);

#if 0
  try
  {
    AP->OptimizeStorage();
  }
  catch(...)
  {
    // a memory error was reported, typically ReportError.
    // We just continue with fingers crossed.
  }
#endif

  AddAndResetStartTime("computation of the final AP", true); 

  ML_Operator* AP_ML = ML_Operator_Create(Comm_ML());
  ML_Operator_WrapEpetraMatrix(AP.get(), AP_ML);

  // ======== //
  // create R //
  // ======== //
  
  std::vector<int> REntries(NumAggregates * NullSpaceDim);
  for (int AID = 0; AID < NumAggregates; ++AID)
  {
    for (int m = 0; m < NullSpaceDim; ++m)
      REntries[AID * NullSpaceDim + m] = NodesOfAggregate[AID].size() * NumPDEEqns_;
  }

  R_ = rcp(new Epetra_CrsMatrix(Copy, CoarseMap, &REntries[0], true));
  REntries.resize(0);

  for (int AID = 0; AID < NumAggregates; ++AID)
  {
    const int& MySize = NodesOfAggregate[AID].size();

    // FIXME: make it faster
    for (int j = 0; j < MySize; ++j)
      for (int m = 0; m < NumPDEEqns_; ++m)
        for (int k = 0; k < NullSpaceDim; ++k)
        {
          int LCID = NodesOfAggregate[AID][j] * NumPDEEqns_ + m;
          int GCID = FineMap.GID(LCID);
          assert (GCID != -1);

          double& val = NullSpace[k][LCID];

          int GRID = CoarseMap.GID(AID * NullSpaceDim + k);
          int ierr = R_->InsertGlobalValues(GRID, 1, &val, &GCID);
          if (ierr < 0)
            ML_CHK_ERR(-1);
        }
  }

  NodesOfAggregate.resize(0);

  R_->FillComplete(FineMap, CoarseMap);
#if 0
  try
  {
    R_->OptimizeStorage();
  }
  catch(...)
  {
    // a memory error was reported, typically ReportError.
    // We just continue with fingers crossed.
  }
#endif

  ML_Operator* R_ML = ML_Operator_Create(Comm_ML());
  ML_Operator_WrapEpetraMatrix(R_.get(), R_ML);

  AddAndResetStartTime("computation of R", true); 

  // ======== //
  // Create C //
  // ======== //

  C_ML_ = ML_Operator_Create(Comm_ML());
  ML_2matmult(R_ML, AP_ML, C_ML_, ML_MSR_MATRIX);

  ML_Operator_Destroy(&AP_ML);
  ML_Operator_Destroy(&R_ML);
  AP = Teuchos::null;

  C_ = rcp(new ML_Epetra::RowMatrix(C_ML_, &Comm(), false));
  assert (R_->OperatorRangeMap().SameAs(C_->OperatorDomainMap()));

  TotalTime.ResetStartTime();

  AddAndResetStartTime("computation of C", true); 

  if (verbose_)
  {
    std::cout << "Matrix-free preconditioner built. Now building solver for C..." << std::endl; 
  }

  Teuchos::ParameterList& sublist = List_.sublist("ML list");
  sublist.set("PDE equations", NullSpaceDim);
  sublist.set("null space: type", "pre-computed");
  sublist.set("null space: dimension", NewNullSpace.NumVectors());
  sublist.set("null space: vectors", NewNullSpace.Values());

  MLP_ = rcp(new MultiLevelPreconditioner(*C_, sublist, true));

  assert (MLP_.get() != 0);

  IsComputed_ = true;

  AddAndResetStartTime("computation of the preconditioner for C", true); 

  if (verbose_)
  {
    std::cout << std::endl;
    std::cout << "Total CPU time for construction (all included) = ";
    std::cout << TotalCPUTime() << std::endl;
    ML_print_line("=",78);
  }

  return(0);
}