        const Epetra_Map & Map,
        double a,
        double diag,
        double c
    ) :
        // global number of rows
        int NumGlobalElements = Map.NumGlobalElements();
        // local number of rows
        int NumMyElements = Map.NumMyElements();
        // get update list
        int * MyGlobalElements = new int [NumMyElements];
        Map.MyGlobalElements( MyGlobalElements );

        // Add  rows one-at-a-time
        // Need some vectors to help
        // Off diagonal Values will always be -1

        double *Values = new double[2];
        Values[0] = a;
        Values[1] = c;
        int *Indices = new int[2];
        int NumEntries;

        for( int i=0 ; i<NumMyElements; ++i ) {
            if (MyGlobalElements[i]==0) {
                Indices[0] = 1;
                NumEntries = 1;
            } else if (MyGlobalElements[i] == NumGlobalElements-1) {
                Indices[0] = NumGlobalElements-2;
                NumEntries = 1;
            } else {
                Indices[0] = MyGlobalElements[i]-1;
                Indices[1] = MyGlobalElements[i]+1;
                NumEntries = 2;
            InsertGlobalValues(MyGlobalElements[i], NumEntries, Values, Indices);
            // Put in the diagonal entry
            InsertGlobalValues(MyGlobalElements[i], 1, &diag, MyGlobalElements+i);

        // Finish up

        delete [] MyGlobalElements;
        delete [] Values;
        delete [] Indices;
// Builds a deliberately unbalanced map with the same global element count as
// oldMap and stores it in newMap.
//
//   oldMap - supplies the communicator and the global element count.
//   newMap - receives a map allocated with `new`; this function never frees
//            it, so the caller is responsible for deleting it.
//
// With fewer than three processes every element is placed on process 0.
// Otherwise the elements are split between processes 0 and 2 and every other
// process (process 1 in particular) receives none.
void build_test_map(const Epetra_Map & oldMap, Epetra_Map *& newMap) {
  const int NumProc = oldMap.Comm().NumProc();
  const int MyPID   = oldMap.Comm().MyPID();

  const int num_global = oldMap.NumGlobalElements();
  int num_local = 0;

  if (NumProc < 3) {
    // Dump everything onto proc 0
    if (MyPID == 0) num_local = num_global;
  }
  else {
    // Split everything between procs 0 and 2 (leave proc 1 empty).
    // Proc 2 takes the remainder so an odd count is still fully covered.
    if (MyPID == 0)      num_local = num_global / 2;
    else if (MyPID == 2) num_local = num_global - num_global / 2;
  }

  newMap = new Epetra_Map(num_global, num_local, 0, oldMap.Comm());
}
// Builds an explicit sparse matrix from op.ApplyInverse by probing it with
// one unit vector per global index.
//
//   op  - operator whose ApplyInverse is probed.
//   map - map used for the probe vectors and as the row map of the result.
//
// Returns the assembled Epetra_FECrsMatrix (as an RCP to its CrsMatrix base).
//
// For each rowIndex in [0, NumGlobalElements) the unit vector e_rowIndex is
// applied; locally owned result entries whose magnitude exceeds `tol` are
// inserted into global row rowIndex.
// NOTE(review): rowIndex is used directly as a global ID in map.LID(), which
// assumes the map's GIDs are 0..N-1 - confirm against callers.
// NOTE(review): the return code of ApplyInverse is not checked, and no
// GlobalAssemble()/FillComplete() is visible here - confirm whether callers
// are expected to finish the matrix.
Teuchos::RCP<Epetra_CrsMatrix> Epetra_Operator_to_Epetra_Matrix::constructInverseMatrix(const Epetra_Operator &op, const Epetra_Map &map)
{
  int numEntriesPerRow = 0;
  Teuchos::RCP<Epetra_FECrsMatrix> matrix = Teuchos::rcp(new Epetra_FECrsMatrix(::Copy, map, numEntriesPerRow));

  const int numRows = map.NumGlobalElements();
  const int numMyElements = map.NumMyElements();

  Epetra_Vector X(map);
  Epetra_Vector Y(map);

  const double tol = 1e-15; // values below this will be considered 0

  // Reused across rows so the buffers are allocated only once.
  std::vector<double> values;
  std::vector<int> indices;

  for (int rowIndex=0; rowIndex<numRows; rowIndex++)
  {
    // Only the owning process sets the 1.0; every process takes part in the
    // ApplyInverse call.
    const int lid = map.LID(rowIndex);
    if (lid != -1)
    {
      X[lid] = 1.0;
    }
    op.ApplyInverse(X, Y);
    if (lid != -1)
    {
      X[lid] = 0.0; // restore X to all zeros for the next probe
    }

    values.clear();
    indices.clear();
    for (int i=0; i<numMyElements; i++)
    {
      const double y = Y[i];
      // Explicit two-sided comparison instead of unqualified abs(): abs() can
      // resolve to the integer overload and truncate |y| < 1 to zero.
      if (y > tol || y < -tol)
      {
        values.push_back(y);
        indices.push_back(map.GID(i));
      }
    }

    // &values[0] is undefined for an empty vector, so skip empty rows.
    if (!values.empty())
    {
      matrix->InsertGlobalValues(rowIndex, static_cast<int>(values.size()), &values[0], &indices[0]);
    }
  }

  return matrix;
}
int main(int argc, char *argv[]) {

  Epetra_MpiComm Comm (MPI_COMM_WORLD);
  Epetra_SerialComm Comm;

  cout << Comm << endl;

  int MyPID = Comm.MyPID();

  bool verbose = false; 
  if (MyPID==0) verbose = true;

  if(argc < 2 && verbose) {
    cerr << "Usage: " << argv[0] 
	 << " HB_filename [level_fill [level_overlap [absolute_threshold [ relative_threshold]]]]" << endl
	 << "where:" << endl
	 << "HB_filename        - filename and path of a Harwell-Boeing data set" << endl
	 << "level_fill         - The amount of fill to use for ILU(k) preconditioner (default 0)" << endl
	 << "level_overlap      - The amount of overlap used for overlapping Schwarz subdomains (default 0)" << endl
	 << "absolute_threshold - The minimum value to place on the diagonal prior to factorization (default 0.0)" << endl
	 << "relative_threshold - The relative amount to perturb the diagonal prior to factorization (default 1.0)" << endl << endl
	 << "To specify a non-default value for one of these parameters, you must specify all" << endl
	 << " preceding values but not any subsequent parameters. Example:" << endl
	 << "ifpackHbSerialMsr.exe mymatrix.hb 1  - loads mymatrix.hb, uses level fill of one, all other values are defaults" << endl
	 << endl;


  // Uncomment the next three lines to debug in mpi mode
  //int tmp;
  //if (MyPID==0) cin >> tmp;

  Epetra_Map * readMap;
  Epetra_CrsMatrix * readA; 
  Epetra_Vector * readx; 
  Epetra_Vector * readb;
  Epetra_Vector * readxexact;
  // Call routine to read in HB problem
  Trilinos_Util_ReadHb2Epetra(argv[1], Comm, readMap, readA, readx, readb, readxexact);

  // Create uniform distributed map
  Epetra_Map map(readMap->NumGlobalElements(), 0, Comm);

  // Create Exporter to distribute read-in matrix and vectors

  Epetra_Export exporter(*readMap, map);
  Epetra_CrsMatrix A(Copy, map, 0);
  Epetra_Vector x(map);
  Epetra_Vector b(map);
  Epetra_Vector xexact(map);

  Epetra_Time FillTimer(Comm);
  x.Export(*readx, exporter, Add);
  b.Export(*readb, exporter, Add);
  xexact.Export(*readxexact, exporter, Add);
  double vectorRedistributeTime = FillTimer.ElapsedTime();
  A.Export(*readA, exporter, Add);
  double matrixRedistributeTime = FillTimer.ElapsedTime() - vectorRedistributeTime;
  double fillCompleteTime = FillTimer.ElapsedTime() - matrixRedistributeTime;
  if (Comm.MyPID()==0)	{
    cout << "\n\n****************************************************" << endl;
    cout << "\n Vector redistribute  time (sec) = " << vectorRedistributeTime<< endl;
    cout << "    Matrix redistribute time (sec) = " << matrixRedistributeTime << endl;
    cout << "    Transform to Local  time (sec) = " << fillCompleteTime << endl<< endl;
  Epetra_Vector tmp1(*readMap);
  Epetra_Vector tmp2(map);
  readA->Multiply(false, *readxexact, tmp1);

  A.Multiply(false, xexact, tmp2);
  double residual;
  if (verbose) cout << "Norm of Ax from file            = " << residual << endl;
  if (verbose) cout << "Norm of Ax after redistribution = " << residual << endl << endl << endl;

  //cout << "A from file = " << *readA << endl << endl << endl;

  //cout << "A after dist = " << A << endl << endl << endl;

  delete readA;
  delete readx;
  delete readb;
  delete readxexact;
  delete readMap;


  // Construct ILU preconditioner

  double elapsed_time, total_flops, MFLOPs;
  Epetra_Time timer(Comm);

  int LevelFill = 0;
  if (argc > 2)  LevelFill = atoi(argv[2]);
  if (verbose) cout << "Using Level Fill = " << LevelFill << endl;
  int Overlap = 0;
  if (argc > 3) Overlap = atoi(argv[3]);
  if (verbose) cout << "Using Level Overlap = " << Overlap << endl;
  double Athresh = 0.0;
  if (argc > 4) Athresh = atof(argv[4]);
  if (verbose) cout << "Using Absolute Threshold Value of = " << Athresh << endl;

  double Rthresh = 1.0;
  if (argc > 5) Rthresh = atof(argv[5]);
  if (verbose) cout << "Using Relative Threshold Value of = " << Rthresh << endl;

  Ifpack_IlukGraph * IlukGraph = 0;
  Ifpack_CrsRiluk * ILUK = 0;

  if (LevelFill>-1) {
    elapsed_time = timer.ElapsedTime();
    IlukGraph = new Ifpack_IlukGraph(A.Graph(), LevelFill, Overlap);
    elapsed_time = timer.ElapsedTime() - elapsed_time;
    if (verbose) cout << "Time to construct ILUK graph = " << elapsed_time << endl;

    Epetra_Flops fact_counter;
    elapsed_time = timer.ElapsedTime();
    ILUK = new Ifpack_CrsRiluk(*IlukGraph);
    int initerr = ILUK->InitValues(A);
    if (initerr!=0) cout << Comm << "InitValues error = " << initerr;
    elapsed_time = timer.ElapsedTime() - elapsed_time;
    total_flops = ILUK->Flops();
    MFLOPs = total_flops/elapsed_time/1000000.0;
    if (verbose) cout << "Time to compute preconditioner values = " 
		    << elapsed_time << endl
		    << "MFLOPS for Factorization = " << MFLOPs << endl;
    //cout << *ILUK << endl;
  double Condest;
  ILUK->Condest(false, Condest);

  if (verbose) cout << "Condition number estimate for this preconditioner = " << Condest << endl;
  int Maxiter = 500;
  double Tolerance = 1.0E-14;

  Epetra_Vector xcomp(map);
  Epetra_Vector resid(map);

  Epetra_Flops counter;

  elapsed_time = timer.ElapsedTime();

  BiCGSTAB(A, xcomp, b, ILUK, Maxiter, Tolerance, &residual, verbose);

  elapsed_time = timer.ElapsedTime() - elapsed_time;
  total_flops = counter.Flops();
  MFLOPs = total_flops/elapsed_time/1000000.0;
  if (verbose) cout << "Time to compute solution = " 
		    << elapsed_time << endl
		    << "Number of operations in solve = " << total_flops << endl
		    << "MFLOPS for Solve = " << MFLOPs<< endl << endl;

  resid.Update(1.0, xcomp, -1.0, xexact, 0.0); // resid = xcomp - xexact


  if (verbose) cout << "Norm of the difference between exact and computed solutions = " << residual << endl;


  if (ILUK!=0) delete ILUK;
  if (IlukGraph!=0) delete IlukGraph;
  MPI_Finalize() ;

return 0 ;
/* Apply an identity matrix to the Schur complement operator. Drop the entries
   entries using a relative threshold. Assemble the result in a Crs Matrix
   which will be our approximate Schur complement.
Teuchos::RCP<Epetra_CrsMatrix> computeApproxSchur(shylu_config *config,
    shylu_symbolic *sym,
    Epetra_CrsMatrix *G, Epetra_CrsMatrix *R,
    Epetra_LinearProblem *LP, Amesos_BaseSolver *solver,
    Ifpack_Preconditioner *ifSolver, Epetra_CrsMatrix *C,
    Epetra_Map *localDRowMap)
    double relative_thres = config->relative_threshold;
    int nvectors = 16;

    ShyLU_Probing_Operator probeop(config, sym, G, R, LP, solver, ifSolver, C,
                                    localDRowMap, nvectors);

    // Get row map
    Epetra_Map rMap = G->RowMap();
    int *rows = rMap.MyGlobalElements();
    int totalElems = rMap.NumGlobalElements();
    int localElems = rMap.NumMyElements();
    //cout << " totalElems in Schur Complement" << totalElems << endl;
    //cout << myPID << " localElems" << localElems << endl;

    // **************** Two collectives here *********************
    Teuchos::Time ftime("setup time");
    int prefixSum;
    G->Comm().ScanSum(&localElems, &prefixSum, 1);
    //cout << " prefixSum" << prefixSum << endl;
    // Start the index in prefixSum-localElems
    int *mySGID = new int[totalElems];   // vector of size Schur complement !
    int *allSGID = new int[totalElems];   // vector of size Schur complement !
    int i, j;
    for (i = 0, j = 0; i < totalElems ; i++)
        if (i >= prefixSum - localElems && i < prefixSum)
            mySGID[i] = rows[j];
            mySGID[i] = 0;
        allSGID[i] = 0;

    C->Comm().SumAll(mySGID, allSGID, totalElems);

    cout << "Time to Compute RowIDS" << ftime.totalElapsedTime() << endl;
    // Now everyone knows the GIDs in the Schur complement

    //cout << rMap << endl;
    j = 0;
    Teuchos::RCP<Epetra_CrsMatrix> Sbar = Teuchos::rcp(new Epetra_CrsMatrix(
                                            Copy, rMap, localElems));
    int nentries;
    double *values = new double[localElems]; // Need to adjust this for more
    int *indices = new int[localElems];      // than one vector
    double *vecvalues;
    int dropped = 0;
    double *maxvalue = new double[nvectors];
    Teuchos::Time app_time("Apply time");
    int findex = totalElems / nvectors ;
    for (i = 0 ; i < findex*nvectors ; i+=nvectors)
        Epetra_MultiVector probevec(rMap, nvectors);
        Epetra_MultiVector Scol(rMap, nvectors);

        int cindex;
        for (int k = 0; k < nvectors; k++)
            cindex = k+i;
            if (cindex >= prefixSum - localElems && cindex < prefixSum)
                probevec.ReplaceGlobalValue(allSGID[cindex], k, 1.0);

        probeop.Apply(probevec, Scol);
        for (int k = 0; k < nvectors; k++) //TODO:Need to switch these loops
            cindex = k+i;
            vecvalues = Scol[k];
            //cout << "MAX" << maxvalue << endl;
            for (j = 0 ; j < localElems ; j++)
                nentries = 0; // inserting one entry in each row for now
                if (allSGID[cindex] == rows[j]) // diagonal entry
                    values[nentries] = vecvalues[j];
                    indices[nentries] = allSGID[cindex];
                    Sbar->InsertGlobalValues(rows[j], nentries, values, indices);
                else if (abs(vecvalues[j]/maxvalue[k]) > relative_thres)
                    values[nentries] = vecvalues[j];
                    indices[nentries] = allSGID[cindex];
                    Sbar->InsertGlobalValues(rows[j], nentries, values, indices);
                    if (vecvalues[j] != 0.0) dropped++;


    for ( ; i < totalElems ; i++)
        Epetra_MultiVector probevec(rMap, 1); // TODO: Try doing more than one
        Epetra_MultiVector Scol(rMap, 1);     // vector at a time

        if (i >= prefixSum - localElems && i < prefixSum)
            probevec.ReplaceGlobalValue(allSGID[i], 0, 1.0);

        probeop.Apply(probevec, Scol);
        vecvalues = Scol[0];
        //cout << "MAX" << maxvalue << endl;
        for (j = 0 ; j < localElems ; j++)
            nentries = 0; // inserting one entry in each row for now
            if (allSGID[i] == rows[j]) // diagonal entry
                values[nentries] = vecvalues[j];
                indices[nentries] = allSGID[i];
                Sbar->InsertGlobalValues(rows[j], nentries, values, indices);
            else if (abs(vecvalues[j]/maxvalue[0]) > relative_thres)
                values[nentries] = vecvalues[j];
                indices[nentries] = allSGID[i];
                Sbar->InsertGlobalValues(rows[j], nentries, values, indices);
                if (vecvalues[j] != 0.0) dropped++;
    cout << "Time in finding and dropping entries" << ftime.totalElapsedTime() << endl;
    cout << "Time in Apply of probing" << app_time.totalElapsedTime() << endl;
    cout << "#dropped entries" << dropped << endl;
    delete[] allSGID;
    delete[] mySGID;
    delete[] values;
    delete[] indices;
    delete[] maxvalue;

    return Sbar;
// Fills the locally owned rows of A with the tridiagonal stencil
// (-1, 2, -1); the first and last global rows get only (2, -1).
//
//   A - matrix to fill; its row map decides which rows this process inserts.
//
// Returns 0 on success, or the negative error code of the first failing
// InsertGlobalValues call. A nonzero code also trips an assert in debug
// builds.
int CreateTridi(Epetra_CrsMatrix& A)
{
    Epetra_Map Map = A.RowMap();
    int NumMyElements = Map.NumMyElements();
    int NumGlobalElements = Map.NumGlobalElements();

    // Global IDs of the local rows. The pointer comes from the map and is
    // not freed here.
    int * MyGlobalElements = Map.MyGlobalElements();

    // Add  rows one-at-a-time
    // Need some vectors to help
    // Off diagonal Values will always be -1

    double Values[3];
    int Indices[3];
    int NumEntries;

    for (int i=0; i<NumMyElements; i++)
    {
        const int row = MyGlobalElements[i];

        if (NumGlobalElements == 1)
        {
            // 1x1 matrix: there is no neighbouring column to reference.
            Indices[0] = row;
            Values[0] = 2.0;
            NumEntries = 1;
        }
        else if (row == 0)
        {
            Indices[0] = 0;
            Indices[1] = 1;
            Values[0] = 2.0;
            Values[1] = -1.0;
            NumEntries = 2;
        }
        else if (row == NumGlobalElements-1)
        {
            Indices[0] = NumGlobalElements-1;
            Indices[1] = NumGlobalElements-2;
            Values[0] = 2.0;
            Values[1] = -1.0;
            NumEntries = 2;
        }
        else
        {
            Indices[0] = row-1;
            Indices[1] = row;
            Indices[2] = row+1;
            Values[0] = -1.0;
            Values[1] = 2.0;
            Values[2] = -1.0;
            NumEntries = 3;
        }

        // The call is made outside assert() so it still runs when NDEBUG
        // compiles the assertion away.
        const int ierr = A.InsertGlobalValues(row, NumEntries, Values, Indices);
        assert(ierr == 0);
        if (ierr < 0) return ierr;
    }

    return 0;
}
// *************************************************************
// main program - This benchmark code reads a Harwell-Boeing data
//                set and finds the minimal eigenvalue of the matrix
//                using inverse iteration.
// *************************************************************
// Driver: reads a Harwell-Boeing matrix and runs invIteration on it,
// then prints the resulting eigenvalue estimate with timing and flop totals.
//
// Command line: exactly one argument, the HB data file.
// Returns 0 on completion, 1 on a usage error.
int main(int argc, char *argv[]) {

#ifdef EPETRA_MPI
  // Epetra_MpiComm needs an initialized MPI runtime; MPI_Finalize is called
  // on every exit path below.
  MPI_Init(&argc, &argv);
  Epetra_MpiComm Comm (MPI_COMM_WORLD);
#else
  Epetra_SerialComm Comm;
#endif

  cout << Comm << endl;

  int MyPID = Comm.MyPID();

  // Print out detailed results on process 0 only (turn off for best performance)
  bool verbose = (MyPID==0);

  if(argc != 2) {
    if (verbose) cerr << "Usage: " << argv[0] << " HB_data_file" << endl;
#ifdef EPETRA_MPI
    MPI_Finalize();
#endif
    return 1; // Error
  }

  // Define pointers that will be set by HB read function

  Epetra_Map * readMap;
  Epetra_CrsMatrix * readA;
  Epetra_Vector * readx;
  Epetra_Vector * readb;
  Epetra_Vector * readxexact;

  // Call function to read in HB problem
  Trilinos_Util_ReadHb2Epetra(argv[1], Comm, readMap, readA, readx, readb, readxexact);

  // Not interested in x, b or xexact for an eigenvalue problem
  delete readx;
  delete readb;
  delete readxexact;

#ifdef EPETRA_MPI // If running in parallel, we need to distribute matrix across all PEs.

  // Create uniform distributed map
  Epetra_Map map(readMap->NumGlobalElements(), 0, Comm);

  // Create Exporter to distribute read-in matrix and vectors

  Epetra_Export exporter(*readMap, map);
  Epetra_CrsMatrix A(Copy, map, 0);

  A.Export(*readA, exporter, Add);
  // The exported matrix has to be fill-complete before it is used.
  int fillerr = A.FillComplete();
  if (fillerr!=0) cout << Comm << "FillComplete error = " << fillerr;

  // The serial copies are no longer needed once A holds the data.
  delete readA;
  delete readMap;

#else // If not running in parallel, we do not need to distribute the matrix
  Epetra_CrsMatrix & A = *readA;
#endif

  // Create flop counter to collect all FLOPS
  // NOTE(review): counter is not attached to A in this block, so the flop
  // totals below depend on code outside this view.
  Epetra_Flops counter;

  double lambda = 0; // Minimal eigenvalue returned here
  // Call inverse iteration solver
  Epetra_Time timer(Comm);
  invIteration(A, lambda, verbose);
  double elapsedTime = timer.ElapsedTime();
  double totalFlops = counter.Flops();
  double MFLOPS = totalFlops/elapsedTime/1000000.0;

  cout << endl
       << "*************************************************" << endl
       << " Approximate smallest eigenvalue = " << lambda << endl
       << "    Total Time    = " << elapsedTime << endl
       << "    Total FLOPS   = " << totalFlops << endl
       << "    Total MFLOPS  = " << MFLOPS << endl
       << "*************************************************" << endl;

  // All done
#ifdef EPETRA_MPI
  // readA and readMap were already released right after the export.
  MPI_Finalize();
#else
  // In the serial build A aliases *readA, so it is freed only now.
  delete readA;
  delete readMap;
#endif

  return (0);
}
LOCA::Epetra::AugmentedOp::buildExtendedMap(const Epetra_BlockMap& uMap,
					    Epetra_Map*& eMapPtr,
					    bool buildImporter,
					    bool haveParam)
  Epetra_BlockMap& nonconstUnderlyingMap = const_cast<Epetra_BlockMap&>(uMap);

  // Convert underlying map to point map if necessary
  Epetra_Map* uPointMapPtr = 
  bool allocatedPointMap = false;
  if (uPointMapPtr == NULL) {
    allocatedPointMap = true;
    blockMap2PointMap(uMap, uPointMapPtr);

  int max_gid = uPointMapPtr->MaxAllGID();
  int num_global_elements = uPointMapPtr->NumGlobalElements();
  int num_my_elements = uPointMapPtr->NumMyElements();
  int *global_elements = uPointMapPtr->MyGlobalElements();
  const Epetra_Comm& comm = uPointMapPtr->Comm();
  int index_base = uPointMapPtr->IndexBase();

  int ext_num_global_elements;
  int ext_num_my_elements;
  int *ext_global_elements;

  // Compute number of extended global elements
  if (buildImporter)
    ext_num_global_elements = 
      num_global_elements + numConstraints*comm.NumProc();
    ext_num_global_elements = num_global_elements + numConstraints;

  // Compute number of extended local elements
  if (buildImporter || haveParam)
     ext_num_my_elements = num_my_elements + numConstraints;
    ext_num_my_elements = num_my_elements;

  // Allocate extended global elements array
  ext_global_elements = new int[ext_num_my_elements];

  // Set extended global elements
  for (int i=0; i<num_my_elements; i++) {
    ext_global_elements[i] = global_elements[i];
  if (buildImporter || haveParam)
    for (int i=0; i<numConstraints; i++)
      ext_global_elements[num_my_elements+i] = max_gid + 1 + i;

  // Create extended point map
  eMapPtr = new Epetra_Map(ext_num_global_elements, ext_num_my_elements,
			   ext_global_elements, index_base, comm);

  // Free global elements array
  delete [] ext_global_elements;
  if (allocatedPointMap)
    delete uPointMapPtr;
// Driver: reads a Harwell-Boeing problem, redistributes it over a uniform
// map, checks ||A*xexact|| before and after redistribution, and writes the
// distributed matrix to "test.mm" in Matrix Market format.
//
// Command line: HB_filename.
// Returns 0 on completion, 1 when the file name argument is missing.
int main(int argc, char *argv[]) {

#ifdef EPETRA_MPI
  // Epetra_MpiComm needs an initialized MPI runtime; MPI_Finalize is called
  // on every exit path below.
  MPI_Init(&argc, &argv);
  Epetra_MpiComm Comm (MPI_COMM_WORLD);
#else
  Epetra_SerialComm Comm;
#endif

  int MyPID = Comm.MyPID();

  // Only process 0 prints detailed output. The flag used to start out true,
  // which made the rank test a no-op.
  bool verbose = (MyPID==0);

  if (verbose)
    cout << EpetraExt::EpetraExt_Version() << endl << endl;
  cout << Comm << endl;

  if (argc < 2) {
    if (verbose) {
      cerr << "Usage: " << argv[0]
	   << " HB_filename" << endl;
    }
    // Every process must stop here: argv[1] is read unconditionally below.
#ifdef EPETRA_MPI
    MPI_Finalize();
#endif
    return 1;
  }

  // Uncomment the next three lines to debug in mpi mode
  //int tmp;
  //if (MyPID==0) cin >> tmp;

  Epetra_Map * readMap;
  Epetra_CrsMatrix * readA;
  Epetra_Vector * readx;
  Epetra_Vector * readb;
  Epetra_Vector * readxexact;
  // Call routine to read in HB problem
  Trilinos_Util_ReadHb2Epetra(argv[1], Comm, readMap, readA, readx, readb, readxexact);

  // Create uniform distributed map
  Epetra_Map map(readMap->NumGlobalElements(), 0, Comm);

  // Create Exporter to distribute read-in matrix and vectors

  Epetra_Export exporter(*readMap, map);
  Epetra_CrsMatrix A(Copy, map, 0);
  Epetra_Vector x(map);
  Epetra_Vector b(map);
  Epetra_Vector xexact(map);

  // FillTimer.ElapsedTime() is cumulative, so each phase subtracts the
  // phases that came before it.
  Epetra_Time FillTimer(Comm);
  x.Export(*readx, exporter, Add);
  b.Export(*readb, exporter, Add);
  xexact.Export(*readxexact, exporter, Add);
  double vectorRedistributeTime = FillTimer.ElapsedTime();
  A.Export(*readA, exporter, Add);
  double matrixRedistributeTime = FillTimer.ElapsedTime() - vectorRedistributeTime;
  // The matrix has to be fill-complete before Multiply() is called on it.
  int fillerr = A.FillComplete();
  if (fillerr!=0) cout << Comm << "FillComplete error = " << fillerr;
  double fillCompleteTime = FillTimer.ElapsedTime()
    - vectorRedistributeTime - matrixRedistributeTime;
  if (Comm.MyPID()==0)	{
    cout << "\n\n****************************************************" << endl;
    cout << "\n Vector redistribute  time (sec) = " << vectorRedistributeTime<< endl;
    cout << "    Matrix redistribute time (sec) = " << matrixRedistributeTime << endl;
    cout << "    Transform to Local  time (sec) = " << fillCompleteTime << endl<< endl;
  }

  // Sanity check: ||A*xexact|| must agree before and after redistribution.
  Epetra_Vector tmp1(*readMap);
  Epetra_Vector tmp2(map);
  readA->Multiply(false, *readxexact, tmp1);

  A.Multiply(false, xexact, tmp2);
  double residual = 0.0;
  tmp1.Norm2(&residual);
  if (verbose) cout << "Norm of Ax from file            = " << residual << endl;
  tmp2.Norm2(&residual);
  if (verbose) cout << "Norm of Ax after redistribution = " << residual << endl << endl << endl;

  //cout << "A from file = " << *readA << endl << endl << endl;

  //cout << "A after dist = " << A << endl << endl << endl;

  delete readA;
  delete readx;
  delete readb;
  delete readxexact;
  delete readMap;


  EpetraExt::RowMatrixToMatrixMarketFile("test.mm", A, "test matrix", "This is a test matrix");

#ifdef EPETRA_MPI
  MPI_Finalize() ;
#endif

  return 0 ;
}
//  Amesos_TestMultiSolver.cpp reads in a matrix in Harwell-Boeing format, 
//  calls one of the sparse direct solvers, using blocked right hand sides
//  and computes the error and residual.  
//  TestSolver ignores the Harwell-Boeing right hand sides, creating
//  random right hand sides instead.  
//  Amesos_TestMultiSolver can test either A x = b or A^T x = b.
//  This can be a bit confusing because sparse direct solvers 
//  use compressed column storage - the transpose of Trilinos'
//  sparse row storage.
//  Matrices:
//    readA - Serial.  As read from the file.
//    transposeA - Serial.  The transpose of readA.
//    serialA - if (transpose) then transposeA else readA 
//    distributedA - readA distributed to all processes
//    passA - if ( distributed ) then distributedA else serialA
int Amesos_TestMultiSolver( Epetra_Comm &Comm, char *matrix_file, int numsolves, 
		      SparseSolverType SparseSolver, bool transpose,
		      int special, AMESOS_MatrixType matrix_type ) {

  int iam = Comm.MyPID() ;

  //  int hatever;
  //  if ( iam == 0 )  std::cin >> hatever ; 

  Epetra_Map * readMap;
  Epetra_CrsMatrix * readA; 
  Epetra_Vector * readx; 
  Epetra_Vector * readb;
  Epetra_Vector * readxexact;
  std::string FileName = matrix_file ;
  int FN_Size = FileName.size() ; 
  std::string LastFiveBytes = FileName.substr( EPETRA_MAX(0,FN_Size-5), FN_Size );
  std::string LastFourBytes = FileName.substr( EPETRA_MAX(0,FN_Size-4), FN_Size );
  bool NonContiguousMap = false; 

  if ( LastFiveBytes == ".triU" ) { 
    NonContiguousMap = true; 
    // Call routine to read in unsymmetric Triplet matrix
    EPETRA_CHK_ERR( Trilinos_Util_ReadTriples2Epetra( matrix_file, false, Comm, readMap, readA, readx, 
						      readb, readxexact, NonContiguousMap ) );
  } else {
    if ( LastFiveBytes == ".triS" ) { 
      NonContiguousMap = true; 
      // Call routine to read in symmetric Triplet matrix
      EPETRA_CHK_ERR( Trilinos_Util_ReadTriples2Epetra( matrix_file, true, Comm, 
							readMap, readA, readx, 
							readb, readxexact, NonContiguousMap ) );
    } else {
      if (  LastFourBytes == ".mtx" ) { 
	EPETRA_CHK_ERR( Trilinos_Util_ReadMatrixMarket2Epetra( matrix_file, Comm, readMap, 
							       readA, readx, readb, readxexact) );
      } else {
	// Call routine to read in HB problem
	Trilinos_Util_ReadHb2Epetra( matrix_file, Comm, readMap, readA, readx, 
						     readb, readxexact) ;

  Epetra_CrsMatrix transposeA(Copy, *readMap, 0);
  Epetra_CrsMatrix *serialA ; 

  if ( transpose ) {
    assert( CrsMatrixTranspose( readA, &transposeA ) == 0 ); 
    serialA = &transposeA ; 
  } else {
    serialA = readA ; 

  // Create uniform distributed map
  Epetra_Map map(readMap->NumGlobalElements(), 0, Comm);
  Epetra_Map* map_;

  if( NonContiguousMap ) {
    //  map gives us NumMyElements and MyFirstElement;
    int NumGlobalElements =  readMap->NumGlobalElements();
    int NumMyElements = map.NumMyElements();
    int MyFirstElement = map.MinMyGID();
    std::vector<int> MapMap_( NumGlobalElements );
    readMap->MyGlobalElements( &MapMap_[0] ) ;
    Comm.Broadcast( &MapMap_[0], NumGlobalElements, 0 ) ; 
    map_ = new Epetra_Map( NumGlobalElements, NumMyElements, &MapMap_[MyFirstElement], 0, Comm);
  } else {
    map_ = new Epetra_Map( map ) ; 

  // Create Exporter to distribute read-in matrix and vectors
  Epetra_Export exporter(*readMap, *map_);
  Epetra_CrsMatrix A(Copy, *map_, 0);

  Epetra_RowMatrix * passA = 0; 
  Epetra_MultiVector * passx = 0; 
  Epetra_MultiVector * passb = 0;
  Epetra_MultiVector * passxexact = 0;
  Epetra_MultiVector * passresid = 0;
  Epetra_MultiVector * passtmp = 0;

  Epetra_MultiVector x(*map_,numsolves);
  Epetra_MultiVector b(*map_,numsolves);
  Epetra_MultiVector xexact(*map_,numsolves);
  Epetra_MultiVector resid(*map_,numsolves);
  Epetra_MultiVector tmp(*map_,numsolves);

  Epetra_MultiVector serialx(*readMap,numsolves);
  Epetra_MultiVector serialb(*readMap,numsolves);
  Epetra_MultiVector serialxexact(*readMap,numsolves);
  Epetra_MultiVector serialresid(*readMap,numsolves);
  Epetra_MultiVector serialtmp(*readMap,numsolves);

  bool distribute_matrix = ( matrix_type == AMESOS_Distributed ) ; 
  if ( distribute_matrix ) { 
    //  Initialize x, b and xexact to the values read in from the file
    A.Export(*serialA, exporter, Add);


    passA = &A; 
    passx = &x; 
    passb = &b;
    passxexact = &xexact;
    passresid = &resid;
    passtmp = &tmp;
  } else { 
    passA = serialA; 
    passx = &serialx; 
    passb = &serialb;
    passxexact = &serialxexact;
    passresid = &serialresid;
    passtmp = &serialtmp;

  passxexact->SetSeed(131) ; 
  passx->SetSeed(11231) ; 

  passb->PutScalar( 0.0 );
  passA->Multiply( transpose, *passxexact, *passb ) ; 

  Epetra_MultiVector CopyB( *passb ) ;

  double Anorm = passA->NormInf() ; 
  SparseDirectTimingVars::SS_Result.Set_Anorm(Anorm) ;

  Epetra_LinearProblem Problem(  (Epetra_RowMatrix *) passA, 
				 (Epetra_MultiVector *) passx, 
				 (Epetra_MultiVector *) passb );

  double max_resid = 0.0;
  for ( int j = 0 ; j < special+1 ; j++ ) { 
    Epetra_Time TotalTime( Comm ) ; 
    if ( false ) { 

      unused code

    } else if ( SparseSolver == UMFPACK ) { 
      UmfpackOO umfpack( (Epetra_RowMatrix *) passA, 
			 (Epetra_MultiVector *) passx, 
			 (Epetra_MultiVector *) passb ) ; 
      umfpack.SetTrans( transpose ) ; 
      umfpack.Solve() ; 
    } else if ( SparseSolver == SuperLU ) { 
      SuperluserialOO superluserial( (Epetra_RowMatrix *) passA, 
				     (Epetra_MultiVector *) passx, 
				     (Epetra_MultiVector *) passb ) ; 

      superluserial.SetPermc( SuperLU_permc ) ; 
      superluserial.SetTrans( transpose ) ; 
      superluserial.SetUseDGSSV( special == 0 ) ; 
      superluserial.Solve() ; 
    } else if ( SparseSolver == SuperLUdist ) { 
      SuperludistOO superludist( Problem ) ; 
      superludist.SetTrans( transpose ) ; 
      EPETRA_CHK_ERR( superludist.Solve( true ) ) ;
    } else if ( SparseSolver == SuperLUdist2 ) { 
      Superludist2_OO superludist2( Problem ) ; 
      superludist2.SetTrans( transpose ) ; 
      EPETRA_CHK_ERR( superludist2.Solve( true ) ) ;
    } else if ( SparseSolver == SPOOLES ) { 
      SpoolesOO spooles( (Epetra_RowMatrix *) passA, 
			 (Epetra_MultiVector *) passx, 
			 (Epetra_MultiVector *) passb ) ; 
      spooles.SetTrans( transpose ) ; 
      spooles.Solve() ; 
    } else if ( SparseSolver == DSCPACK ) { 
      Teuchos::ParameterList ParamList ;
      Amesos_Dscpack dscpack( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( dscpack.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( dscpack.Solve( ) ); 
    } else if ( SparseSolver == UMFPACK ) { 
      Teuchos::ParameterList ParamList ;
      Amesos_Umfpack umfpack( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( umfpack.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( umfpack.SetUseTranspose( transpose ) ); 
      EPETRA_CHK_ERR( umfpack.Solve( ) ); 
    } else if ( SparseSolver == KLU ) { 
      Teuchos::ParameterList ParamList ;
      Amesos_Klu klu( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( klu.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( klu.SetUseTranspose( transpose ) ); 
      EPETRA_CHK_ERR( klu.SymbolicFactorization(  ) ); 
      EPETRA_CHK_ERR( klu.NumericFactorization(  ) ); 
      EPETRA_CHK_ERR( klu.Solve( ) ); 
    } else if ( SparseSolver == PARAKLETE ) { 
      Teuchos::ParameterList ParamList ;
      Amesos_Paraklete paraklete( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( paraklete.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( paraklete.SetUseTranspose( transpose ) ); 
      EPETRA_CHK_ERR( paraklete.SymbolicFactorization(  ) ); 
      EPETRA_CHK_ERR( paraklete.NumericFactorization(  ) ); 
      EPETRA_CHK_ERR( paraklete.Solve( ) ); 
    } else if ( SparseSolver == SuperLU ) { 
      Epetra_SLU superluserial( &Problem ) ; 
      EPETRA_CHK_ERR( superluserial.SetUseTranspose( transpose ) ); 
      EPETRA_CHK_ERR( superluserial.SymbolicFactorization(  ) ); 
      EPETRA_CHK_ERR( superluserial.NumericFactorization(  ) ); 

      EPETRA_CHK_ERR( superluserial.Solve( ) ); 
    } else if ( SparseSolver == LAPACK ) { 
      Teuchos::ParameterList ParamList ;
      ParamList.set( "MaxProcs", -3 );
      Amesos_Lapack lapack( Problem ) ; 
      EPETRA_CHK_ERR( lapack.SetUseTranspose( transpose ) ); 
      EPETRA_CHK_ERR( lapack.SymbolicFactorization(  ) ); 
      EPETRA_CHK_ERR( lapack.NumericFactorization(  ) ); 
      EPETRA_CHK_ERR( lapack.Solve( ) ); 
    } else if ( SparseSolver == TAUCS ) { 
      Teuchos::ParameterList ParamList ;
      Amesos_Taucs taucs( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( taucs.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( taucs.SetUseTranspose( transpose ) ); 
      EPETRA_CHK_ERR( taucs.SymbolicFactorization( ) ); 
      EPETRA_CHK_ERR( taucs.NumericFactorization( ) ); 
      EPETRA_CHK_ERR( taucs.Solve( ) ); 
    } else if ( SparseSolver == PARDISO ) { 
      Teuchos::ParameterList ParamList ;
      Amesos_Pardiso pardiso( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( pardiso.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( pardiso.SetUseTranspose( transpose ) ); 
      EPETRA_CHK_ERR( pardiso.SymbolicFactorization( ) ); 
      EPETRA_CHK_ERR( pardiso.NumericFactorization( ) ); 
      EPETRA_CHK_ERR( pardiso.Solve( ) ); 
    } else if ( SparseSolver == PARKLETE ) { 
      Teuchos::ParameterList ParamList ;
      Amesos_Parklete parklete( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( parklete.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( parklete.SetUseTranspose( transpose ) ); 
      EPETRA_CHK_ERR( parklete.SymbolicFactorization( ) ); 
      EPETRA_CHK_ERR( parklete.NumericFactorization( ) ); 
      EPETRA_CHK_ERR( parklete.Solve( ) ); 
    } else if ( SparseSolver == MUMPS ) { 
      Teuchos::ParameterList ParamList ;
      Amesos_Mumps mumps( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( mumps.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( mumps.SetUseTranspose( transpose ) ); 
      EPETRA_CHK_ERR( mumps.SymbolicFactorization( ) ); 
      EPETRA_CHK_ERR( mumps.NumericFactorization( ) ); 
      EPETRA_CHK_ERR( mumps.Solve( ) ); 
    } else if ( SparseSolver == SCALAPACK ) { 
      Teuchos::ParameterList ParamList ;
      Amesos_Scalapack scalapack( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( scalapack.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( scalapack.SetUseTranspose( transpose ) ); 
      EPETRA_CHK_ERR( scalapack.SymbolicFactorization( ) ); 
      EPETRA_CHK_ERR( scalapack.NumericFactorization( ) ); 
      EPETRA_CHK_ERR( scalapack.Solve( ) ); 
    } else if ( SparseSolver == SUPERLUDIST ) { 
      Teuchos::ParameterList ParamList ;
      Amesos_Superludist superludist( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( superludist.SetParameters( ParamList ) ); 

      EPETRA_CHK_ERR( superludist.SetUseTranspose( transpose ) ); 
      EPETRA_CHK_ERR( superludist.SymbolicFactorization(  ) ); 
      EPETRA_CHK_ERR( superludist.NumericFactorization(  ) ); 
      EPETRA_CHK_ERR( superludist.Solve( ) ); 
    } else if ( SparseSolver == SUPERLU ) { 
      Teuchos::ParameterList ParamList ;
      Amesos_Superlu superlu( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( superlu.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( superlu.SetUseTranspose( transpose ) ); 
      EPETRA_CHK_ERR( superlu.SymbolicFactorization(  ) ); 
      EPETRA_CHK_ERR( superlu.NumericFactorization(  ) ); 
      EPETRA_CHK_ERR( superlu.Solve( ) ); 
    } else if ( SparseSolver == SPOOLESSERIAL ) { 
      SpoolesserialOO spoolesserial( (Epetra_RowMatrix *) passA, 
				     (Epetra_MultiVector *) passx, 
				     (Epetra_MultiVector *) passb ) ; 
      spoolesserial.Solve() ;
    } else { 
      SparseDirectTimingVars::log_file << "Solver not implemented yet" << std::endl ;
      std::cerr << "\n\n####################  Requested solver not available (Or not tested with blocked RHS) on this platform #####################\n" << std::endl ;

    SparseDirectTimingVars::SS_Result.Set_Total_Time( TotalTime.ElapsedTime() ); 
    //    SparseDirectTimingVars::SS_Result.Set_First_Time( 0.0 ); 
    //    SparseDirectTimingVars::SS_Result.Set_Middle_Time( 0.0 ); 
    //    SparseDirectTimingVars::SS_Result.Set_Last_Time( 0.0 ); 

    //  Compute the error = norm(xcomp - xexact )
    std::vector <double> error(numsolves) ; 
    double max_error = 0.0;
    passresid->Update(1.0, *passx, -1.0, *passxexact, 0.0);

    for ( int i = 0 ; i< numsolves; i++ ) 
      if ( error[i] > max_error ) max_error = error[i] ; 
    SparseDirectTimingVars::SS_Result.Set_Error(max_error) ;

    //  passxexact->Norm2(&error[0] ) ; 
    //  passx->Norm2(&error ) ; 

    //  Compute the residual = norm(Ax - b)
    std::vector <double> residual(numsolves) ; 
    passA->Multiply( transpose, *passx, *passtmp);
    passresid->Update(1.0, *passtmp, -1.0, *passb, 0.0); 
    //    passresid->Update(1.0, *passtmp, -1.0, CopyB, 0.0); 

    for ( int i = 0 ; i< numsolves; i++ ) 
      if ( residual[i] > max_resid ) max_resid = residual[i] ; 

    SparseDirectTimingVars::SS_Result.Set_Residual(max_resid) ;
    std::vector <double> bnorm(numsolves); 
    passb->Norm2( &bnorm[0] ) ; 
    SparseDirectTimingVars::SS_Result.Set_Bnorm(bnorm[0]) ;

    std::vector <double> xnorm(numsolves); 
    passx->Norm2( &xnorm[0] ) ; 
    SparseDirectTimingVars::SS_Result.Set_Xnorm(xnorm[0]) ;

    if ( false && iam == 0 ) { 

      std::cout << " Amesos_TestMutliSolver.cpp " << std::endl ; 
      for ( int i = 0 ; i< numsolves && i < 10 ; i++ ) {
	std::cout << "i=" << i 
	     << " error = " << error[i] 
	     << " xnorm = " << xnorm[i] 
	     << " residual = " << residual[i] 
	     << " bnorm = " << bnorm[i] 
	     << std::endl ; 
      std::cout << std::endl << " max_resid = " << max_resid ; 
      std::cout << " max_error = " << max_error << std::endl ; 
      std::cout << " Get_residual() again = " << SparseDirectTimingVars::SS_Result.Get_Residual() << std::endl ;

  delete readA;
  delete readx;
  delete readb;
  delete readxexact;
  delete readMap;
  delete map_;

return 0 ;
// ============================================================================
void EpetraExt::XMLWriter::
Write(const std::string& Label, const Epetra_Map& Map)
  TEUCHOS_TEST_FOR_EXCEPTION(IsOpen_ == false, std::logic_error,
                     "No file has been opened");

  int NumGlobalElements = Map.NumGlobalElements();
  int* MyGlobalElements = Map.MyGlobalElements();

  if (Comm_.MyPID() == 0)
    std::ofstream of(FileName_.c_str(), std::ios::app);

    of << "<Map Label=\"" << Label 
      << "\" NumElements=\"" << NumGlobalElements << '"'
      << " IndexBase=\"" << Map.IndexBase() << '"'
      << " NumProc=\"" << Comm_.NumProc() << '"';


  for (int iproc = 0; iproc < Comm_.NumProc(); ++iproc)
    if (iproc == Comm_.MyPID())
      std::ofstream of(FileName_.c_str(), std::ios::app);

      of << " ElementsOnProc" << iproc << "=\"" << Map.NumMyElements() << '"';

  if (Comm_.MyPID() == 0)
    std::ofstream of(FileName_.c_str(), std::ios::app);
    of << '>' << std::endl;

  for (int iproc = 0; iproc < Comm_.NumProc(); iproc++)
    if (iproc == Comm_.MyPID())
      std::ofstream of(FileName_.c_str(), std::ios::app);

      of << "<Proc ID=\"" << Comm_.MyPID() << "\">" << std::endl;

      for (int i = 0; i < Map.NumMyElements(); ++i)
        of << MyGlobalElements[i] << std::endl;

      of << "</Proc>" << std::endl;

  if (Comm_.MyPID() == 0)
    std::ofstream of(FileName_.c_str(), std::ios::app);
    of << "</Map>" << std::endl;
//  NewMatNewMap
//
//  Builds a new Epetra_CrsMatrix from In whose row, column, range and domain
//  maps are chosen by the five integer selectors below, and returns it in an
//  RCP.  When all five selectors are 0 the function returns In itself, wrapped
//  in a non-owning RCP (no copy is made).
//
//  Diagonal:  0=no change, 1=eliminate entry
//             from the map for the largest row element in process 0
//             2=add diagonal entries to the matrix, with a zero value 
//             (assume row map contains all diagonal entries). 
//  ReindexRowMap:  
//    0=no change, 1= add 2 (still contiguous), 2=non-contiguous
//  ReindexColMap:
//    0=same as RowMap, 1=add 4 (different from RowMap, but contiguous)
//  RangeMapType:
//    0=no change, 1=serial map, 2=bizarre distribution, 3=replicated map
//  DomainMapType:
//    0=no change, 1=serial map, 2=bizarre distribution, 3=replicated map
//
//  NOTE(review): in this copy of the function several closing braces, "break"
//  statements, "#endif" lines and call arguments are not present; the notes
//  inside the body point at the places where that changes the meaning.
//  Confirm against the upstream file.
RCP<Epetra_CrsMatrix> NewMatNewMap(Epetra_CrsMatrix& In, 
					   int Diagonal, 
					   int ReindexRowMap,
					   int ReindexColMap,
					   int RangeMapType,
					   int DomainMapType

  //  If we are making no change, return the original matrix (which has a linear map) 
#if 0
  std::cout << __FILE__ << "::" << __LINE__ << " " 
       << Diagonal << " " 
       << ReindexRowMap << " " 
       << ReindexColMap << " " 
       << RangeMapType << " " 
       << DomainMapType << " " << std::endl ; 

  // rcp( &In, false ) does not take ownership: the caller keeps owning In.
  if ( Diagonal + ReindexRowMap + ReindexColMap + RangeMapType + DomainMapType == 0 ) {
    RCP<Epetra_CrsMatrix> ReturnOrig = rcp( &In, false );
    return ReturnOrig ;

  //  Diagonal==2 is used for a different purpose - 
  //    Making sure that the diagonal of the matrix is non-empty.
  //  Note:  The diagonal must exist in In.RowMap().
  if ( Diagonal == 2 ) { 
    assert( ReindexRowMap==0 && ReindexColMap == 0 ) ; 

  // Index-permutation callbacks; selected further down from ReindexRowMap
  // and ReindexColMap.
  int (*RowPermute)(int in) = 0;
  int (*ColPermute)(int in) = 0;

  // Range checks on the selectors (active only when assert is enabled).
  assert( Diagonal >= 0  && Diagonal <= 2 ); 
  assert( ReindexRowMap>=0 && ReindexRowMap<=2 );
  assert( ReindexColMap>=0 && ReindexColMap<=1 );
  assert( RangeMapType>=0 && RangeMapType<=3 );
  assert( DomainMapType>=0 && DomainMapType<=3 );

  // Local copies of the four maps of In, followed by their local and global sizes.
  Epetra_Map DomainMap = In.DomainMap();
  Epetra_Map RangeMap = In.RangeMap();
  Epetra_Map ColMap = In.ColMap();
  Epetra_Map RowMap = In.RowMap();
  int NumMyRowElements = RowMap.NumMyElements();
  int NumMyColElements = ColMap.NumMyElements();
  int NumMyRangeElements = RangeMap.NumMyElements();
  int NumMyDomainElements = DomainMap.NumMyElements();

  int NumGlobalRowElements = RowMap.NumGlobalElements();
  int NumGlobalColElements = ColMap.NumGlobalElements();
  int NumGlobalRangeElements = RangeMap.NumGlobalElements();
  int NumGlobalDomainElements = DomainMap.NumGlobalElements();
  assert( NumGlobalRangeElements == NumGlobalDomainElements ) ; 

  // Work arrays holding original and permuted global IDs for each map.
  // NOTE(review): the MyGlobal*Elements vectors are read below, but no visible
  // statement copies the maps' global IDs into them - confirm the fill calls
  // were not lost.
  std::vector<int> MyGlobalRowElements( NumMyRowElements ) ; 
  std::vector<int> NumEntriesPerRow( NumMyRowElements ) ; 
  std::vector<int> MyPermutedGlobalRowElements( NumMyRowElements ) ; 
  std::vector<int> MyGlobalColElements( NumMyColElements ) ; 
  std::vector<int> MyPermutedGlobalColElements( NumMyColElements ) ; // Used to create the column map
  std::vector<int> MyPermutedGlobalColElementTable( NumMyColElements ) ; // To convert local indices to global
  std::vector<int> MyGlobalRangeElements( NumMyRangeElements ) ; 
  std::vector<int> MyPermutedGlobalRangeElements( NumMyRangeElements ) ; 
  std::vector<int> MyGlobalDomainElements( NumMyDomainElements ) ; 
  std::vector<int> MyPermutedGlobalDomainElements( NumMyDomainElements ) ; 

  // Pick the row and column permutation functions.
  // NOTE(review): no "break" is visible between the cases of either switch;
  // as written every case falls through to the last one - confirm.
  switch( ReindexRowMap ) {
  case 0:
    RowPermute = &NoPermute ;
  case 1:
    RowPermute = &SmallRowPermute ;
  case 2:
    RowPermute = BigRowPermute ;
  switch( ReindexColMap ) {
  case 0:
    ColPermute = RowPermute ;
  case 1:
    ColPermute = &SmallColPermute ;

  //  Create Serial Range and Domain Maps based on the permuted indexing
  // (process 0 is given all NumGlobalRangeElements IDs, every other process none)
  int nlocal = 0;
  if (In.Comm().MyPID()==0) nlocal = NumGlobalRangeElements;
  std::vector<int> AllIDs( NumGlobalRangeElements ) ; 
  for ( int i = 0; i < NumGlobalRangeElements ; i++ ) AllIDs[i] = (*RowPermute)( i ) ; 
  Epetra_Map SerialRangeMap( -1, nlocal, &AllIDs[0], 0, In.Comm()); 
  std::vector<int> AllIDBs( NumGlobalRangeElements ) ; 
  for ( int i = 0; i < NumGlobalRangeElements ; i++ ) AllIDBs[i] = (*ColPermute)( i ) ; 
  Epetra_Map SerialDomainMap( -1, nlocal, &AllIDBs[0], 0, In.Comm()); 

  //  Create Bizarre Range and Domain Maps based on the permuted indexing
  //  These are nearly serial, having all but one element on process 0
  //  The goal here is to make sure that we can use Domain and Range maps 
  //  that are neither serial, nor distributed in the normal manner.
  std::vector<int> AllIDCs( NumGlobalRangeElements ) ; 
  for ( int i = 0; i < NumGlobalRangeElements ; i++ ) AllIDCs[i] = (*ColPermute)( i ) ; 
  if ( In.Comm().NumProc() > 1 ) { 
    if (In.Comm().MyPID()==0) nlocal = NumGlobalRangeElements-1;
    if (In.Comm().MyPID()==1) {
      nlocal = 1;
      AllIDCs[0] = (*ColPermute)( NumGlobalRangeElements - 1 );
  // Rank of the calling process; used below for the Diagonal==1 column removal.
  int iam = In.Comm().MyPID();
  Epetra_Map BizarreDomainMap( -1, nlocal, &AllIDCs[0], 0, In.Comm()); 

  // Same construction for the range side, using the row permutation.
  std::vector<int> AllIDDs( NumGlobalRangeElements ) ; 
  for ( int i = 0; i < NumGlobalRangeElements ; i++ ) AllIDDs[i] = (*RowPermute)( i ) ; 
  if ( In.Comm().NumProc() > 1 ) { 
    if (In.Comm().MyPID()==0) nlocal = NumGlobalRangeElements-1;
    if (In.Comm().MyPID()==1) {
      nlocal = 1;
      AllIDDs[0] = (*RowPermute)( NumGlobalRangeElements -1 ) ;
  Epetra_Map BizarreRangeMap( -1, nlocal, &AllIDDs[0], 0, In.Comm()); 

  //  Compute the column map 
  //  If Diagonal==1, remove the column corresponding to the last row owned 
  //  by process 0.  Removing this column from a tridiagonal matrix, leaves
  //  a disconnected, but non-singular matrix.  
  int NumMyColElementsOut = 0 ; 
  int NumGlobalColElementsOut ; 
  // NOTE(review): no "else" is visible before the second assignment, so as
  // written it always overwrites the Diagonal==1 value - confirm.
  if ( Diagonal == 1 ) 
    NumGlobalColElementsOut = NumGlobalColElements-1; 
    NumGlobalColElementsOut = NumGlobalColElements; 
  if ( Diagonal == 1 && iam==0 ) { 
    // Keep every local column except the one matching the last local row ID.
    for ( int i=0; i < NumMyColElements  ; i++ ) {
      if ( MyGlobalColElements[i] != MyGlobalRowElements[NumMyRowElements-1] ) {
	MyPermutedGlobalColElements[NumMyColElementsOut++] = 
	  (*ColPermute)( MyGlobalColElements[i] ) ; 
    assert( NumMyColElementsOut == NumMyColElements-1 );
  } else {
    // Keep every local column, permuted.
    for ( int i=0; i < NumMyColElements  ; i++ )  
      MyPermutedGlobalColElements[i] = 
	(*ColPermute)( MyGlobalColElements[i] ) ; 
    NumMyColElementsOut = NumMyColElements ; 
    if ( Diagonal == 2 ) {
      //  For each row, make sure that the column map has this row in it, 
      //    if it doesn't, add it to the column map.  
      //  Note:  MyPermutedGlobalColElements == MyGlobalColElements when 
      //  Diagonal==2 because  ( Diagonal == 2 ) implies:
      //     ReindexRowMap==0 && ReindexColMap == 0  - see assert above
      for ( int i=0; i < NumMyRowElements  ; i++ ) {
	bool MissingDiagonal = true; 
	for ( int j=0; j < NumMyColElements; j++ ) { 
	  if ( MyGlobalRowElements[i] == MyGlobalColElements[j] ) {
	    MissingDiagonal = false; 
	// NOTE(review): MyPermutedGlobalColElements was sized NumMyColElements and
	// NumMyColElementsOut equals NumMyColElements at this point; no resize or
	// increment is visible here - confirm this write stays in bounds.
	if ( MissingDiagonal ) {
	  MyPermutedGlobalColElements[NumMyColElementsOut] = MyGlobalRowElements[i];

  //  These tables are used both as the permutation tables and to create the maps.
  for ( int i=0; i < NumMyColElements  ; i++ ) 
    MyPermutedGlobalColElementTable[i] = 
      (*ColPermute)( MyGlobalColElements[i] ) ; 
  for ( int i=0; i < NumMyRowElements  ; i++ ) 
    MyPermutedGlobalRowElements[i] = 
      (*RowPermute)( MyGlobalRowElements[i] ) ; 
  for ( int i=0; i < NumMyRangeElements  ; i++ ) 
    MyPermutedGlobalRangeElements[i] = 
      (*RowPermute)( MyGlobalRangeElements[i] ) ; 
  for ( int i=0; i < NumMyDomainElements  ; i++ ) 
    MyPermutedGlobalDomainElements[i] = 
      (*ColPermute)( MyGlobalDomainElements[i] ) ; 

  // Heap-allocated maps built from the permuted ID lists; each is owned by its RCP.
  RCP<Epetra_Map> PermutedRowMap = 
    rcp( new Epetra_Map( NumGlobalRowElements, NumMyRowElements, 
			 &MyPermutedGlobalRowElements[0], 0, In.Comm() ) ); 
  RCP<Epetra_Map> PermutedColMap = 
    rcp( new Epetra_Map( NumGlobalColElementsOut, NumMyColElementsOut, 
			 &MyPermutedGlobalColElements[0], 0, In.Comm() ) ); 
  RCP<Epetra_Map> PermutedRangeMap = 
    rcp( new Epetra_Map( NumGlobalRangeElements, NumMyRangeElements, 
			 &MyPermutedGlobalRangeElements[0], 0, In.Comm() ) ); 
  RCP<Epetra_Map> PermutedDomainMap = 
    rcp( new Epetra_Map( NumGlobalDomainElements, NumMyDomainElements, 
			 &MyPermutedGlobalDomainElements[0], 0, In.Comm() ) ); 
  //  These vectors are filled and then passed to InsertGlobalValues 
  std::vector<int> ThisRowIndices( In.MaxNumEntries() );
  std::vector<double> ThisRowValues( In.MaxNumEntries() );
  std::vector<int> PermutedGlobalColIndices( In.MaxNumEntries() );

  //std::cout << __FILE__ << "::" <<__LINE__ << std::endl ; 
  // The output matrix, created on the permuted row and column maps.
  RCP<Epetra_CrsMatrix> Out = 
    rcp( new Epetra_CrsMatrix( Copy, *PermutedRowMap, *PermutedColMap, 0 ) );

  // Copy In into Out one local row at a time, translating each local column
  // index to its permuted global ID through MyPermutedGlobalColElementTable.
  for (int i=0; i<NumMyRowElements; i++)

      int NumIndicesThisRow = 0;
      // NOTE(review): this call sits inside assert(), so it is compiled out when
      // NDEBUG is defined; its argument list also looks incomplete here - confirm.
      assert( In.ExtractMyRowCopy( i, 
				   &ThisRowIndices[0] ) == 0 ) ;
      for (int j = 0 ; j < NumIndicesThisRow ; j++ )
	  PermutedGlobalColIndices[j] = MyPermutedGlobalColElementTable[ ThisRowIndices[j] ]  ;
      // For Diagonal==2, find out whether this row already stores its diagonal entry.
      bool MissingDiagonal = false; 
      if ( Diagonal==2 ) { 
	assert( MyGlobalRowElements[i] == MyPermutedGlobalRowElements[i] );
	MissingDiagonal = true; 
	for( int j =0 ; j < NumIndicesThisRow ; j++ ) {
	  if ( PermutedGlobalColIndices[j] == MyPermutedGlobalRowElements[i] ) {
	    MissingDiagonal = false ; 
#if 0
	std::cout  << __FILE__ << "::" << __LINE__ 
	      << " i = " << i 
	      << " MyPermutedGlobalRowElements[i]  = " << MyPermutedGlobalRowElements[i] 
	      <<   " MissingDiagonal = " << MissingDiagonal << std::endl ; 

      // Append an explicit 0.0 on the diagonal when none was found.
      // NOTE(review): ThisRowValues is resized here but PermutedGlobalColIndices
      // is not, although both are written at index NumIndicesThisRow - confirm
      // the index write stays in bounds for a row of MaxNumEntries() entries.
      if ( MissingDiagonal ) { 
	ThisRowValues.resize(NumIndicesThisRow+1) ; 
	ThisRowValues[NumIndicesThisRow] = 0.0;
	PermutedGlobalColIndices[NumIndicesThisRow] = MyPermutedGlobalRowElements[i] ;
#if 0
	std::cout  << __FILE__ << "::" << __LINE__ 
	      << " i = " << i 
	      << "NumIndicesThisRow = " << NumIndicesThisRow 
	      << "ThisRowValues[NumIndicesThisRow = " << ThisRowValues[NumIndicesThisRow] 
	      << " PermutedGlobalColIndices[NumIndcesThisRow] = " << PermutedGlobalColIndices[NumIndicesThisRow] 
	      << std::endl ; 

	NumIndicesThisRow++  ;

      // NOTE(review): as with ExtractMyRowCopy above, the insertion is wrapped in
      // assert() and its argument list looks incomplete here - confirm.
      assert( Out->InsertGlobalValues( MyPermutedGlobalRowElements[i], 
				       &PermutedGlobalColIndices[0] ) >= 0 ); 


  // Map used for the "replicated" option (selector value 3) of both switches below.
  Epetra_LocalMap ReplicatedMap( NumGlobalRangeElements, 0, In.Comm() );

  // Select the range and domain maps handed to FillComplete.  Cases 1-3 wrap
  // function-local maps in non-owning RCPs (second argument false).
  // NOTE(review): no "break" is visible between the cases of either switch;
  // as written every case falls through to case 3 - confirm.
  RCP<Epetra_Map> OutRangeMap ;
  RCP<Epetra_Map> OutDomainMap ;
  switch( RangeMapType ) {
  case 0:
    OutRangeMap = PermutedRangeMap ;
  case 1:
    OutRangeMap = rcp(&SerialRangeMap, false); 
  case 2:
    OutRangeMap = rcp(&BizarreRangeMap, false); 
  case 3:
    OutRangeMap = rcp(&ReplicatedMap, false); 
  //  switch( DomainMapType ) {
  switch( DomainMapType ) {
  case 0:
    OutDomainMap = PermutedDomainMap ;
  case 1:
    OutDomainMap = rcp(&SerialDomainMap, false); 
  case 2:
    OutDomainMap = rcp(&BizarreDomainMap, false); 
  case 3:
    OutDomainMap = rcp(&ReplicatedMap, false); 
#if 0
  assert(Out->FillComplete( *PermutedDomainMap, *PermutedRangeMap )==0);
  // NOTE(review): FillComplete is called inside assert(); under NDEBUG the
  // matrix would be returned without FillComplete having run - confirm.
  assert(Out->FillComplete( *OutDomainMap, *OutRangeMap )==0);

#if 0
  std::cout << __FILE__ << "::" << __LINE__ << std::endl ;
  Out->Print( std::cout ) ; 

  return Out;
// Test driver.  Reads a Harwell-Boeing problem, redistributes it onto a
// uniform map, obtains a reduced problem from an Epetra_CrsSingletonFilter,
// and solves the reduced system twice with BiCGSTAB, using an
// Ifpack_CrsRiluk preconditioner built on an Ifpack_IlukGraph.
//
// Command line (see the usage text below):
//   argv[1]  Harwell-Boeing file name (read unconditionally)
//   argv[2]  level of fill      (optional, default 0)
//   argv[3]  level of overlap   (optional, default 0)
//   argv[4]  absolute threshold (optional, default 0.0)
//   argv[5]  relative threshold (optional, default 1.0)
//
// Returns 0.
//
// NOTE(review): in this copy of the function several statements appear to be
// absent (closing braces, preprocessor guards, and the calls that should
// assign some of the printed values); the notes in the body mark the places
// where that is visible.  Confirm against the upstream file.
int main(int argc, char *argv[]) {

  // NOTE(review): two objects named Comm are declared back to back; presumably
  // these are the MPI and serial alternatives of a preprocessor guard that is
  // not visible here - confirm.
  Epetra_MpiComm Comm (MPI_COMM_WORLD);
  Epetra_SerialComm Comm;

  cout << Comm << endl;

  int MyPID = Comm.MyPID();

  // verbose: only process 0 prints the progress messages.
  // verbose1: enables the extra "A times difference" check on every process.
  bool verbose = false;
  bool verbose1 = true;
  if (MyPID==0) verbose = true;

  // Usage text, printed by process 0 when no file name was given.
  // NOTE(review): no exit/return is visible after the message, and argv[1] is
  // used below regardless of argc - confirm.
  if(argc < 2 && verbose) {
    cerr << "Usage: " << argv[0] 
	 << " HB_filename [level_fill [level_overlap [absolute_threshold [ relative_threshold]]]]" << endl
	 << "where:" << endl
	 << "HB_filename        - filename and path of a Harwell-Boeing data set" << endl
	 << "level_fill         - The amount of fill to use for ILU(k) preconditioner (default 0)" << endl
	 << "level_overlap      - The amount of overlap used for overlapping Schwarz subdomains (default 0)" << endl
	 << "absolute_threshold - The minimum value to place on the diagonal prior to factorization (default 0.0)" << endl
	 << "relative_threshold - The relative amount to perturb the diagonal prior to factorization (default 1.0)" << endl << endl
	 << "To specify a non-default value for one of these parameters, you must specify all" << endl
	 << " preceding values but not any subsequent parameters. Example:" << endl
	 << "ifpackHpcSerialMsr.exe mymatrix.hpc 1  - loads mymatrix.hpc, uses level fill of one, all other values are defaults" << endl
	 << endl;


  // Uncomment the next three lines to debug in mpi mode
  //int tmp;
  //if (MyPID==0) cin >> tmp;

  // Objects filled in by the reader; they are deleted further down once the
  // redistributed copies exist.
  Epetra_Map * readMap;
  Epetra_CrsMatrix * readA; 
  Epetra_Vector * readx; 
  Epetra_Vector * readb;
  Epetra_Vector * readxexact;
  // Call routine to read in HB problem
  Trilinos_Util_ReadHb2Epetra(argv[1], Comm, readMap, readA, readx, readb, readxexact);

  // Create uniform distributed map
  Epetra_Map map(readMap->NumGlobalElements(), 0, Comm);

  // Create Exporter to distribute read-in matrix and vectors

  Epetra_Export exporter(*readMap, map);
  Epetra_CrsMatrix A(Copy, map, 0);
  Epetra_Vector x(map);
  Epetra_Vector b(map);
  Epetra_Vector xexact(map);

  // Export vectors, then the matrix, recording the elapsed time of each phase.
  Epetra_Time FillTimer(Comm);
  x.Export(*readx, exporter, Add);
  b.Export(*readb, exporter, Add);
  xexact.Export(*readxexact, exporter, Add);
  double vectorRedistributeTime = FillTimer.ElapsedTime();
  A.Export(*readA, exporter, Add);
  double matrixRedistributeTime = FillTimer.ElapsedTime() - vectorRedistributeTime;
  // NOTE(review): no FillComplete call on A is visible before this timing, and
  // only matrixRedistributeTime is subtracted from the total - confirm.
  double fillCompleteTime = FillTimer.ElapsedTime() - matrixRedistributeTime;
  if (Comm.MyPID()==0)	{
    cout << "\n\n****************************************************" << endl;
    cout << "\n Vector redistribute  time (sec) = " << vectorRedistributeTime<< endl;
    cout << "    Matrix redistribute time (sec) = " << matrixRedistributeTime << endl;
    cout << "    Transform to Local  time (sec) = " << fillCompleteTime << endl<< endl;
  // Sanity check: apply the matrix to the exact solution before (tmp1) and
  // after (tmp2) redistribution.
  Epetra_Vector tmp1(*readMap);
  Epetra_Vector tmp2(map);
  readA->Multiply(false, *readxexact, tmp1);

  A.Multiply(false, xexact, tmp2);
  // NOTE(review): residual is printed twice below without any visible
  // assignment (no norm of tmp1 or tmp2 is computed here) - confirm.
  double residual;
  if (verbose) cout << "Norm of Ax from file            = " << residual << endl;
  if (verbose) cout << "Norm of Ax after redistribution = " << residual << endl << endl << endl;

  //cout << "A from file = " << *readA << endl << endl << endl;

  //cout << "A after dist = " << A << endl << endl << endl;

  // The objects returned by the reader are no longer needed.
  delete readA;
  delete readx;
  delete readb;
  delete readxexact;
  delete readMap;


  // Problems with fewer than 100 global rows are printed in full.
  bool smallProblem = false;
  if (A.RowMap().NumGlobalElements()<100) smallProblem = true;

  if (smallProblem)
    cout << "Original Matrix = " << endl << A   << endl;


  Epetra_LinearProblem FullProblem(&A, &x, &b);
  // NOTE(review): normb is printed below without a visible assignment - confirm.
  double normb, norma;
  norma = A.NormInf();
  if (verbose)
    cout << "Inf norm of Original Matrix = " << norma << endl
	 << "Inf norm of Original RHS    = " << normb << endl;
  // Singleton reduction and its timings.
  // NOTE(review): no analysis or construction call on SingletonFilter is
  // visible between the timer reads below - confirm.
  Epetra_Time ReductionTimer(Comm);
  Epetra_CrsSingletonFilter SingletonFilter;
  double reduceInitTime = ReductionTimer.ElapsedTime();
  double reduceAnalyzeTime = ReductionTimer.ElapsedTime() - reduceInitTime;

  if (SingletonFilter.SingletonsDetected())
    cout << "Singletons found" << endl;
  else {
    cout << "Singletons not found" << endl;
  double reduceConstructTime = ReductionTimer.ElapsedTime() - reduceInitTime;

  double totalReduceTime = ReductionTimer.ElapsedTime();

  if (verbose)
    cout << "\n\n****************************************************" << endl
	 << "    Reduction init  time (sec)           = " << reduceInitTime<< endl
	 << "    Reduction Analyze time (sec)         = " << reduceAnalyzeTime << endl
	 << "    Construct Reduced Problem time (sec) = " << reduceConstructTime << endl
	 << "    Reduction Total time (sec)           = " << totalReduceTime << endl<< endl;


  // Matrix, RHS and LHS of the reduced problem.  The (0) operator selects the
  // first vector of each multivector.
  Epetra_LinearProblem * ReducedProblem = SingletonFilter.ReducedProblem();

  Epetra_CrsMatrix * Ap = dynamic_cast<Epetra_CrsMatrix *>(ReducedProblem->GetMatrix());
  Epetra_Vector * bp = (*ReducedProblem->GetRHS())(0);
  Epetra_Vector * xp = (*ReducedProblem->GetLHS())(0);

  if (smallProblem)
    cout << " Reduced Matrix = " << endl << *Ap << endl
	 << " LHS before sol = " << endl << *xp << endl
	 << " RHS            = " << endl << *bp << endl;

  // Construct ILU preconditioner

  double elapsed_time, total_flops, MFLOPs;
  Epetra_Time timer(Comm);

  // Optional command-line parameters; each falls back to its default when absent.
  // NOTE(review): Athresh and Rthresh are parsed and printed, but no visible
  // statement passes them to the preconditioner - confirm.
  int LevelFill = 0;
  if (argc > 2)  LevelFill = atoi(argv[2]);
  if (verbose) cout << "Using Level Fill = " << LevelFill << endl;
  int Overlap = 0;
  if (argc > 3) Overlap = atoi(argv[3]);
  if (verbose) cout << "Using Level Overlap = " << Overlap << endl;
  double Athresh = 0.0;
  if (argc > 4) Athresh = atof(argv[4]);
  if (verbose) cout << "Using Absolute Threshold Value of = " << Athresh << endl;

  double Rthresh = 1.0;
  if (argc > 5) Rthresh = atof(argv[5]);
  if (verbose) cout << "Using Relative Threshold Value of = " << Rthresh << endl;

  // Both stay null when LevelFill is negative; they are deleted at the end of main.
  Ifpack_IlukGraph * IlukGraph = 0;
  Ifpack_CrsRiluk * ILUK = 0;

  if (LevelFill>-1) {
    // Build the ILU(k) graph from the graph of the reduced matrix, and time it.
    elapsed_time = timer.ElapsedTime();
    IlukGraph = new Ifpack_IlukGraph(Ap->Graph(), LevelFill, Overlap);
    elapsed_time = timer.ElapsedTime() - elapsed_time;
    if (verbose) cout << "Time to construct ILUK graph = " << elapsed_time << endl;

    // Create the preconditioner object and load the matrix values into it.
    // NOTE(review): only InitValues is visible; no factorization call follows
    // before the MFLOPS figure is computed - confirm.
    Epetra_Flops fact_counter;
    elapsed_time = timer.ElapsedTime();
    ILUK = new Ifpack_CrsRiluk(*IlukGraph);
    int initerr = ILUK->InitValues(*Ap);
    if (initerr!=0) {
      cout << endl << Comm << endl << "  InitValues error = " << initerr;
      if (initerr==1) cout << "  Zero diagonal found, warning error only";
      cout << endl << endl;
    elapsed_time = timer.ElapsedTime() - elapsed_time;
    total_flops = ILUK->Flops();
    MFLOPs = total_flops/elapsed_time/1000000.0;
    if (verbose) cout << "Time to compute preconditioner values = " 
		    << elapsed_time << endl
		    << "MFLOPS for Factorization = " << MFLOPs << endl;
    //cout << *ILUK << endl;
  // NOTE(review): ILUK is null when LevelFill is negative; confirm this call
  // is inside the LevelFill>-1 branch in the upstream file.
  double Condest;
  ILUK->Condest(false, Condest);

  if (verbose) cout << "Condition number estimate for this preconditioner = " << Condest << endl;
  // Iteration limit and convergence tolerance passed to BiCGSTAB.
  int Maxiter = 100;
  double Tolerance = 1.0E-8;

  // NOTE(review): counter.Flops() is read after the solve, but no visible
  // statement attaches counter to Ap, xp, bp or ILUK - confirm.
  Epetra_Flops counter;
  if (ILUK!=0) ILUK->SetFlopCounter(*Ap);

  elapsed_time = timer.ElapsedTime();

  // NOTE(review): normreducedb is printed below without a visible assignment - confirm.
  double normreducedb, normreduceda;
  normreduceda = Ap->NormInf();
  if (verbose) 
    cout << "Inf norm of Reduced Matrix = " << normreduceda << endl
	 << "Inf norm of Reduced RHS    = " << normreducedb << endl;

  // First solve of the reduced system.
  BiCGSTAB(*Ap, *xp, *bp, ILUK, Maxiter, Tolerance, &residual, verbose);

  elapsed_time = timer.ElapsedTime() - elapsed_time;
  total_flops = counter.Flops();
  MFLOPs = total_flops/elapsed_time/1000000.0;
  if (verbose) cout << "Time to compute solution = " 
		    << elapsed_time << endl
		    << "Number of operations in solve = " << total_flops << endl
		    << "MFLOPS for Solve = " << MFLOPs<< endl << endl;


  // NOTE(review): x (the full LHS) is printed and compared below, but no
  // visible statement maps the reduced solution xp back into x - confirm.
  if (smallProblem)
  cout << " Reduced LHS after sol = " << endl << *xp << endl
       << " Full    LHS after sol = " << endl << x << endl
       << " Full  Exact LHS         = " << endl << xexact << endl;

  Epetra_Vector resid(x);

  resid.Update(1.0, x, -1.0, xexact, 0.0); // resid = xcomp - xexact

  // NOTE(review): normx and normxexact are printed without a visible
  // assignment, and residual still holds the value written by BiCGSTAB - confirm.
  double normx, normxexact;

  if (verbose) 
    cout << "2-norm of computed solution                               = " << normx << endl
	 << "2-norm of exact solution                                  = " << normxexact << endl
	 << "2-norm of difference between computed and exact solution  = " << residual << endl;
  if (verbose1 && residual>1.0e-5) {
    if (verbose)
      cout << "Difference between computed and exact solution appears large..." << endl
	   << "Computing norm of A times this difference.  If this norm is small, then matrix is singular"
	   << endl;
    Epetra_Vector bdiff(b);
    // NOTE(review): the Multiply call sits inside assert() and is compiled out
    // when NDEBUG is defined - confirm this is intended.
    assert(A.Multiply(false, resid, bdiff)==0);
    if (verbose) 
      cout << "2-norm of A times difference between computed and exact solution  = " << residual << endl;
  if (verbose) 
    cout << "********************************************************" << endl
	 << "              Solving again with 2*Ax=2*b" << endl
	 << "********************************************************" << endl;

  A.Scale(1.0); // NOTE(review): factor is 1.0, so A is unchanged although the banner says 2*A - confirm
  b.Scale(1.0); // NOTE(review): factor is 1.0, so b is unchanged although the banner says 2*b - confirm
  norma = A.NormInf();
  if (verbose)
    cout << "Inf norm of Original Matrix = " << norma << endl
	 << "Inf norm of Original RHS    = " << normb << endl;
  // NOTE(review): no update call on SingletonFilter is visible between the two
  // timer reads, so this measures nothing as written - confirm.
  double updateReducedProblemTime = ReductionTimer.ElapsedTime();
  updateReducedProblemTime = ReductionTimer.ElapsedTime() - updateReducedProblemTime;
  if (verbose)
    cout << "\n\n****************************************************" << endl
	 << "    Update Reduced Problem time (sec)           = " << updateReducedProblemTime<< endl
	 << "****************************************************" << endl;

  // Reload the preconditioner values from the reduced matrix for the second
  // solve; the ILUK object and its graph are reused.
  if (LevelFill>-1) {

    Epetra_Flops fact_counter;
    elapsed_time = timer.ElapsedTime();

    int initerr = ILUK->InitValues(*Ap);
    if (initerr!=0) {
      cout << endl << Comm << endl << "  InitValues error = " << initerr;
      if (initerr==1) cout << "  Zero diagonal found, warning error only";
      cout << endl << endl;
    elapsed_time = timer.ElapsedTime() - elapsed_time;
    total_flops = ILUK->Flops();
    MFLOPs = total_flops/elapsed_time/1000000.0;
    if (verbose) cout << "Time to compute preconditioner values = " 
		    << elapsed_time << endl
		    << "MFLOPS for Factorization = " << MFLOPs << endl;
    double Condest;
    ILUK->Condest(false, Condest);
    if (verbose) cout << "Condition number estimate for this preconditioner = " << Condest << endl;
  normreduceda = Ap->NormInf();
  if (verbose) 
    cout << "Inf norm of Reduced Matrix = " << normreduceda << endl
	 << "Inf norm of Reduced RHS    = " << normreducedb << endl;

  // Second solve of the reduced system.
  // NOTE(review): elapsed_time is not re-sampled immediately before this call,
  // so the subtraction below uses the value left by the preceding section - confirm.
  BiCGSTAB(*Ap, *xp, *bp, ILUK, Maxiter, Tolerance, &residual, verbose);

  elapsed_time = timer.ElapsedTime() - elapsed_time;
  total_flops = counter.Flops();
  MFLOPs = total_flops/elapsed_time/1000000.0;
  if (verbose) cout << "Time to compute solution = " 
		    << elapsed_time << endl
		    << "Number of operations in solve = " << total_flops << endl
		    << "MFLOPS for Solve = " << MFLOPs<< endl << endl;


  if (smallProblem)
  cout << " Reduced LHS after sol = " << endl << *xp << endl
       << " Full    LHS after sol = " << endl << x << endl
       << " Full  Exact LHS         = " << endl << xexact << endl;

  resid.Update(1.0, x, -1.0, xexact, 0.0); // resid = xcomp - xexact


  // Same reporting as after the first solve; the notes given there about
  // normx, normxexact, residual and the assert-wrapped Multiply apply here too.
  if (verbose) 
    cout << "2-norm of computed solution                               = " << normx << endl
	 << "2-norm of exact solution                                  = " << normxexact << endl
	 << "2-norm of difference between computed and exact solution  = " << residual << endl;
  if (verbose1 && residual>1.0e-5) {
    if (verbose)
      cout << "Difference between computed and exact solution appears large..." << endl
	   << "Computing norm of A times this difference.  If this norm is small, then matrix is singular"
	   << endl;
    Epetra_Vector bdiff(b);
    assert(A.Multiply(false, resid, bdiff)==0);
    if (verbose) 
      cout << "2-norm of A times difference between computed and exact solution  = " << residual << endl;

  // Release the preconditioner and its graph (both may be null).
  if (ILUK!=0) delete ILUK;
  if (IlukGraph!=0) delete IlukGraph;
  // NOTE(review): no matching MPI_Init call is visible in this function - confirm.
  MPI_Finalize() ;

return 0 ;
// Performance-test driver.
//
// Command line: NumNodesX NumNodesY NumProcX NumProcY NumPoints [-v|-s]
//   argv[1..4] - local grid size and processor grid, argv[5] - stencil size
//   (5, 9 or 25), argv[6] - optional "-v" (verbose) or "-s" (summary).
//
// The driver fills x/y offset tables describing the full stencil and its
// lower/upper triangular parts, then, for a sequence of RHS counts and for
// dynamic/static profile, generates the problems with GenerateCrsProblem(),
// runs runJadMatrixTests / runMatrixTests / runLUMatrixTests, and finally
// reports MFLOP rates for 10 Norm2, 10 Dot and 10 Update multivector calls.
//
// NOTE(review): a number of brace / preprocessor / sizing lines appear to be
// missing from this listing (e.g. `comm` and `jstop` are declared more than
// once, and no Size() call on the offset vectors is visible). Everything is
// kept as found except the stencil fix marked "FIX" below.
int main(int argc, char *argv[])
  int ierr = 0;
  double elapsed_time;
  double total_flops;
  double MFLOPs;


  // Initialize MPI
  Epetra_MpiComm comm( MPI_COMM_WORLD );
  Epetra_SerialComm comm;

  bool verbose = false;
  bool summary = false;

  // Check if we should print verbose results to standard out
  if (argc>6) if (argv[6][0]=='-' && argv[6][1]=='v') verbose = true;

  // Check if we should print summary results to standard out
  if (argc>6) if (argv[6][0]=='-' && argv[6][1]=='s') summary = true;

  // Five positional arguments are mandatory; print usage otherwise.
  if(argc < 6) {
    cerr << "Usage: " << argv[0]
         << " NumNodesX NumNodesY NumProcX NumProcY NumPoints [-v|-s]" << endl
         << "where:" << endl
         << "NumNodesX         - Number of mesh nodes in X direction per processor" << endl
         << "NumNodesY         - Number of mesh nodes in Y direction per processor" << endl
         << "NumProcX          - Number of processors to use in X direction" << endl
         << "NumProcY          - Number of processors to use in Y direction" << endl
         << "NumPoints         - Number of points to use in stencil (5, 9 or 25 only)" << endl
         << "-v|-s             - (Optional) Run in verbose mode if -v present or summary mode if -s present" << endl
         << " NOTES: NumProcX*NumProcY must equal the number of processors used to run the problem." << endl << endl
	 << " Serial example:" << endl
         << argv[0] << " 16 12 1 1 25 -v" << endl
	 << " Run this program in verbose mode on 1 processor using a 16 X 12 grid with a 25 point stencil."<< endl <<endl
	 << " MPI example:" << endl
         << "mpirun -np 32 " << argv[0] << " 10 12 4 8 9 -v" << endl
	 << " Run this program in verbose mode on 32 processors putting a 10 X 12 subgrid on each processor using 4 processors "<< endl
	 << " in the X direction and 8 in the Y direction.  Total grid size is 40 points in X and 96 in Y with a 9 point stencil."<< endl
         << endl;

    //char tmp;
    //if (comm.MyPID()==0) cout << "Press any key to continue..."<< endl;
    //if (comm.MyPID()==0) cin >> tmp;

  comm.SetTracebackMode(0); // This should shut down any error traceback reporting
  if (verbose && comm.MyPID()==0)
    cout << Epetra_Version() << endl << endl;
  if (summary && comm.MyPID()==0) {
    if (comm.NumProc()==1)
      cout << Epetra_Version() << endl << endl;
      cout << endl << endl; // Print two blank lines to keep output columns lined up

  if (verbose) cout << comm <<endl;

  // Redefine verbose to only print on PE 0

  if (verbose && comm.MyPID()!=0) verbose = false;
  if (summary && comm.MyPID()!=0) summary = false;

  // Positional arguments (converted with atoi, so non-numeric text yields 0).
  int numNodesX = atoi(argv[1]);
  int numNodesY = atoi(argv[2]);
  int numProcsX = atoi(argv[3]);
  int numProcsY = atoi(argv[4]);
  int numPoints = atoi(argv[5]);

  if (verbose || (summary && comm.NumProc()==1)) {
    cout << " Number of local nodes in X direction  = " << numNodesX << endl
	 << " Number of local nodes in Y direction  = " << numNodesY << endl
	 << " Number of global nodes in X direction = " << numNodesX*numProcsX << endl
	 << " Number of global nodes in Y direction = " << numNodesY*numProcsY << endl
	 << " Number of local nonzero entries       = " << numNodesX*numNodesY*numPoints << endl
	 << " Number of global nonzero entries      = " << numNodesX*numNodesY*numPoints*numProcsX*numProcsY << endl
	 << " Number of Processors in X direction   = " << numProcsX << endl
	 << " Number of Processors in Y direction   = " << numProcsY << endl
	 << " Number of Points in stencil           = " << numPoints << endl << endl;
  // Print blank line to keep output columns lined up
  if (summary && comm.NumProc()>1)
    cout << endl << endl << endl << endl << endl << endl << endl << endl<< endl << endl;

  // Argument sanity checks: processor grid, stencil size, grid size.
  if (numProcsX*numProcsY!=comm.NumProc()) {
    cerr << "Number of processors = " << comm.NumProc() << endl
	 << " is not the product of " << numProcsX << " and " << numProcsY << endl << endl;

  if (numPoints!=5 && numPoints!=9 && numPoints!=25) {
    cerr << "Number of points specified = " << numPoints << endl
	 << " is not 5, 9, 25" << endl << endl;

  if (numNodesX*numNodesY<=0) {
    cerr << "Product of number of nodes is <= zero" << endl << endl;

  // Offset tables: (Xoff,Yoff) full stencil, (XLoff,YLoff) lower triangular
  // part, (XUoff,YUoff) upper triangular part.
  Epetra_IntSerialDenseVector Xoff, XLoff, XUoff;
  Epetra_IntSerialDenseVector Yoff, YLoff, YUoff;
  if (numPoints==5) {

     // Generate a 5-point 2D Finite Difference matrix
    Xoff[0] = -1; Xoff[1] = 1; Xoff[2] = 0; Xoff[3] = 0;  Xoff[4] = 0; 
    Yoff[0] = 0;  Yoff[1] = 0; Yoff[2] = 0; Yoff[3] = -1; Yoff[4] = 1; 

     // Generate a 2-point 2D Lower triangular Finite Difference matrix
    XLoff[0] = -1; XLoff[1] =  0; 
    YLoff[0] =  0; YLoff[1] = -1;

     // Generate a 3-point 2D upper triangular Finite Difference matrix
    XUoff[0] =  0; XUoff[1] =  1; XUoff[2] = 0; 
    YUoff[0] =  0; YUoff[1] =  0; YUoff[2] = 1;
  else if (numPoints==9) {
    // Generate a 9-point 2D Finite Difference matrix
    Xoff[0] = -1;  Xoff[1] =  0; Xoff[2] =  1; 
    Yoff[0] = -1;  Yoff[1] = -1; Yoff[2] = -1; 
    Xoff[3] = -1;  Xoff[4] =  0; Xoff[5] =  1; 
    Yoff[3] =  0;  Yoff[4] =  0; Yoff[5] =  0; 
    Xoff[6] = -1;  Xoff[7] =  0; Xoff[8] =  1; 
    Yoff[6] =  1;  Yoff[7] =  1; Yoff[8] =  1; 

    // Generate a 5-point lower triangular 2D Finite Difference matrix
    // FIX: entry 2 previously wrote to Xoff/Yoff (a no-op, since those already
    // held the same values), leaving XLoff[2]/YLoff[2] unset.
    XLoff[0] = -1;  XLoff[1] =  0; XLoff[2] =  1; 
    YLoff[0] = -1;  YLoff[1] = -1; YLoff[2] = -1; 
    XLoff[3] = -1;  XLoff[4] =  0; 
    YLoff[3] =  0;  YLoff[4] =  0;

    // Generate a 4-point upper triangular 2D Finite Difference matrix
    XUoff[0] =  1; 
    YUoff[0] =  0; 
    XUoff[1] = -1;  XUoff[2] =  0; XUoff[3] =  1; 
    YUoff[1] =  1;  YUoff[2] =  1; YUoff[3] =  1; 

  else {
    // Generate a 25-point 2D Finite Difference matrix
    // (five rows of five x-offsets -2..2, y-offset advancing -2..2 per row)
    int xi = 0, yi = 0;
    int xo = -2, yo = -2;
    Xoff[xi++] = xo++;  Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++;
    Yoff[yi++] = yo  ;  Yoff[yi++] = yo  ; Yoff[yi++] = yo  ; Yoff[yi++] = yo  ; Yoff[yi++] = yo  ; 
    xo = -2, yo++;
    Xoff[xi++] = xo++;  Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++;
    Yoff[yi++] = yo  ;  Yoff[yi++] = yo  ; Yoff[yi++] = yo  ; Yoff[yi++] = yo  ; Yoff[yi++] = yo  ; 
    xo = -2, yo++;
    Xoff[xi++] = xo++;  Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++;
    Yoff[yi++] = yo  ;  Yoff[yi++] = yo  ; Yoff[yi++] = yo  ; Yoff[yi++] = yo  ; Yoff[yi++] = yo  ; 
    xo = -2, yo++;
    Xoff[xi++] = xo++;  Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++;
    Yoff[yi++] = yo  ;  Yoff[yi++] = yo  ; Yoff[yi++] = yo  ; Yoff[yi++] = yo  ; Yoff[yi++] = yo  ; 
    xo = -2, yo++;
    Xoff[xi++] = xo++;  Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++; Xoff[xi++] = xo++;
    Yoff[yi++] = yo  ;  Yoff[yi++] = yo  ; Yoff[yi++] = yo  ; Yoff[yi++] = yo  ; Yoff[yi++] = yo  ; 

    // Generate a 13-point lower triangular 2D Finite Difference matrix
    // (two full rows at y=-2,-1, then x=-2..0 at y=0)
    xi = 0, yi = 0;
    xo = -2, yo = -2;
    XLoff[xi++] = xo++;  XLoff[xi++] = xo++; XLoff[xi++] = xo++; XLoff[xi++] = xo++; XLoff[xi++] = xo++;
    YLoff[yi++] = yo  ;  YLoff[yi++] = yo  ; YLoff[yi++] = yo  ; YLoff[yi++] = yo  ; YLoff[yi++] = yo  ; 
    xo = -2, yo++;
    XLoff[xi++] = xo++;  XLoff[xi++] = xo++; XLoff[xi++] = xo++; XLoff[xi++] = xo++; XLoff[xi++] = xo++;
    YLoff[yi++] = yo  ;  YLoff[yi++] = yo  ; YLoff[yi++] = yo  ; YLoff[yi++] = yo  ; YLoff[yi++] = yo  ; 
    xo = -2, yo++;
    XLoff[xi++] = xo++;  XLoff[xi++] = xo++; XLoff[xi++] = xo++;
    YLoff[yi++] = yo  ;  YLoff[yi++] = yo  ; YLoff[yi++] = yo  ;

    // Generate a 13-point upper triangular 2D Finite Difference matrix
    // (x=0..2 at y=0, then two full rows at y=1,2)
    xi = 0, yi = 0;
    xo = 0, yo = 0;
    XUoff[xi++] = xo++;  XUoff[xi++] = xo++; XUoff[xi++] = xo++;
    YUoff[yi++] = yo  ;  YUoff[yi++] = yo  ; YUoff[yi++] = yo  ; 
    xo = -2, yo++;
    XUoff[xi++] = xo++;  XUoff[xi++] = xo++; XUoff[xi++] = xo++; XUoff[xi++] = xo++; XUoff[xi++] = xo++;
    YUoff[yi++] = yo  ;  YUoff[yi++] = yo  ; YUoff[yi++] = yo  ; YUoff[yi++] = yo  ; YUoff[yi++] = yo  ; 
    xo = -2, yo++;
    XUoff[xi++] = xo++;  XUoff[xi++] = xo++; XUoff[xi++] = xo++; XUoff[xi++] = xo++; XUoff[xi++] = xo++;
    YUoff[yi++] = yo  ;  YUoff[yi++] = yo  ; YUoff[yi++] = yo  ; YUoff[yi++] = yo  ; YUoff[yi++] = yo  ; 


  // Output slots filled by GenerateCrsProblem(); each is deleted below after use.
  Epetra_Map * map;
  Epetra_Map * mapL;
  Epetra_Map * mapU;
  Epetra_CrsMatrix * A;
  Epetra_CrsMatrix * L;
  Epetra_CrsMatrix * U;
  Epetra_MultiVector * b;
  Epetra_MultiVector * bt;
  Epetra_MultiVector * xexact;
  Epetra_MultiVector * bL;
  Epetra_MultiVector * btL;
  Epetra_MultiVector * xexactL;
  Epetra_MultiVector * bU;
  Epetra_MultiVector * btU;
  Epetra_MultiVector * xexactU;
  Epetra_SerialDenseVector resvec(0);

  Epetra_Flops flopcounter;
  Epetra_Time timer(comm);

  // NOTE(review): the three jstop declarations and the three k-filters below
  // look like alternatives of a conditional whose directives are not visible.
  int jstop = 1;
  int jstop = 1;
  int jstop = 2;
  for (int j=0; j<jstop; j++) {
    for (int k=1; k<17; k++) {
      if (k<3 || (k%4==0 && k<9)) {
      if (k<6 || k%4==0) {
      if (k<7 || k%2==0) {
      int nrhs=k;
      if (verbose) cout << "\n*************** Results for " << nrhs << " RHS with ";

      // j==0 -> dynamic profile, j!=0 -> static profile
      bool StaticProfile = (j!=0);
      if (verbose) 
	if (StaticProfile) cout << " static profile\n";
	else cout << " dynamic profile\n";
      // Full-stencil problem.
      GenerateCrsProblem(numNodesX, numNodesY, numProcsX, numProcsY, numPoints,
			 Xoff.Values(), Yoff.Values(), nrhs, comm, verbose, summary,
			 map, A, b, bt, xexact, StaticProfile, false);

      Epetra_JadMatrix JA(*A);
      elapsed_time = timer.ElapsedTime();
      if (verbose) cout << "Time to create Jagged diagonal matrix = " << elapsed_time << endl;

      //cout << "A = " << *A << endl;
      //cout << "JA = " << JA << endl;

      runJadMatrixTests(&JA, b, bt, xexact, StaticProfile, verbose, summary);

      runMatrixTests(A, b, bt, xexact, StaticProfile, verbose, summary);

      delete A;
      delete b;
      delete bt; 
      delete xexact;

      // Lower- and upper-triangular problems (stencil size taken from the
      // offset vector length; last argument requests a local-only matrix).
      GenerateCrsProblem(numNodesX, numNodesY, numProcsX, numProcsY, XLoff.Length(),
			 XLoff.Values(), YLoff.Values(), nrhs, comm, verbose, summary,
			 mapL, L, bL, btL, xexactL, StaticProfile, true);

      GenerateCrsProblem(numNodesX, numNodesY, numProcsX, numProcsY, XUoff.Length(),
			 XUoff.Values(), YUoff.Values(), nrhs, comm, verbose, summary,
			 mapU, U, bU, btU, xexactU, StaticProfile, true);

      runLUMatrixTests(L, bL, btL, xexactL, U, bU, btU, xexactU, StaticProfile, verbose, summary);

      delete L;
      delete bL;
      delete btL; 
      delete xexactL;
      delete mapL;

      delete U;
      delete bU;
      delete btU; 
      delete xexactU;
      delete mapU;

      // Work vectors for the multivector kernels; built before map is deleted.
      Epetra_MultiVector q(*map, nrhs);
      Epetra_MultiVector z(q);
      Epetra_MultiVector r(q);
      delete map;


      //10 norms
      for( int i = 0; i < 10; ++i )
	q.Norm2( resvec.Values() );

      elapsed_time = timer.ElapsedTime();
      total_flops = q.Flops();
      MFLOPs = total_flops/elapsed_time/1000000.0;
      if (verbose) cout << "\nTotal MFLOPs for 10 Norm2's= " << MFLOPs << endl;
      if (summary) {
	if (comm.NumProc()==1) cout << "Norm2" << '\t';
	cout << MFLOPs << endl;
      //10 dot's
      for( int i = 0; i < 10; ++i )
	q.Dot(z, resvec.Values());
      elapsed_time = timer.ElapsedTime();
      total_flops = q.Flops();
      MFLOPs = total_flops/elapsed_time/1000000.0;
      if (verbose) cout << "Total MFLOPs for 10 Dot's  = " << MFLOPs << endl;
      if (summary) {
	if (comm.NumProc()==1) cout << "DotProd" << '\t';
	cout << MFLOPs << endl;
      //10 updates
      for( int i = 0; i < 10; ++i )
	q.Update(1.0, z, 1.0, r, 0.0);
      elapsed_time = timer.ElapsedTime();
      total_flops = q.Flops();
      MFLOPs = total_flops/elapsed_time/1000000.0;
      if (verbose) cout << "Total MFLOPs for 10 Updates= " << MFLOPs << endl;
      if (summary) {
	if (comm.NumProc()==1) cout << "Update" << '\t';
	cout << MFLOPs << endl;
  MPI_Finalize() ;

return ierr ;

// Constructs a 2D PDE finite difference matrix using the list of x and y offsets.
// nx      (In) - number of grid points in x direction
// ny      (In) - number of grid points in y direction
//   The total number of equations will be nx*ny ordered such that the x direction changes
//   most rapidly: 
//      First equation is at point (0,0)
//      Second at                  (1,0)
//       ...
//      nx equation at             (nx-1,0)
//      nx+1st equation at         (0,1)

// numPoints (In) - number of points in finite difference stencil
// xoff    (In) - stencil offsets in x direction (of length numPoints)
// yoff    (In) - stencil offsets in y direction (of length numPoints)
//   A standard 5-point finite difference stencil would be described as:
//     numPoints = 5
//     xoff = [-1, 1, 0,  0, 0]
//     yoff = [ 0, 0, 0, -1, 1]

// nrhs    (In) - Number of rhs to generate. (First interface produces vectors, so nrhs is not needed.)

// comm    (In) - an Epetra_Comm object describing the parallel machine (numProcs and my proc ID)
// map    (Out) - Epetra_Map describing distribution of matrix and vectors/multivectors
// A      (Out) - Epetra_CrsMatrix constructed for nx by ny grid using prescribed stencil
//                Off-diagonal values are random between 0 and 1.  If diagonal is part of stencil,
//                diagonal will be slightly diag dominant.
// b      (Out) - Generated RHS.  Values satisfy b = A*xexact
// bt     (Out) - Generated RHS.  Values satisfy bt = A'*xexact
// xexact (Out) - Generated exact solution to A*xexact = b and A'*xexact = bt

// Note: Caller of this function is responsible for deleting all output objects.

// Single right-hand-side convenience overload.
// Forwards every argument to the multivector overload with nrhs fixed at 1 and
// then converts the three returned multivector pointers to Epetra_Vector
// pointers with dynamic_cast.
// The caller owns map, A, b, bt and xexact on return.
void GenerateCrsProblem(int numNodesX, int numNodesY, int numProcsX, int numProcsY, int numPoints, 
			int * xoff, int * yoff,
			const Epetra_Comm  &comm, bool verbose, bool summary, 
			Epetra_Map *& map, 
			Epetra_CrsMatrix *& A, 
			Epetra_Vector *& b, 
			Epetra_Vector *& bt,
			Epetra_Vector *&xexact, bool StaticProfile, bool MakeLocalOnly) {

  // Temporaries that receive the multivector-typed outputs.
  Epetra_MultiVector * b1, * bt1, * xexact1;
  GenerateCrsProblem(numNodesX, numNodesY, numProcsX, numProcsY, numPoints, 
		     xoff, yoff, 1, comm, verbose, summary, 
		     map, A, b1, bt1, xexact1, StaticProfile, MakeLocalOnly);

  // dynamic_cast yields a null pointer if the object is not actually an
  // Epetra_Vector; the results are not checked here.
  b = dynamic_cast<Epetra_Vector *>(b1);
  bt = dynamic_cast<Epetra_Vector *>(bt1);
  xexact = dynamic_cast<Epetra_Vector *>(xexact1);


// Builds a 2D finite-difference style test problem from a stencil.
//
// numNodesX/numNodesY - local grid size; numProcsX/numProcsY - processor grid
// numPoints           - number of stencil entries in xoff/yoff
// xoff, yoff          - per-entry stencil offsets in x and y
// nrhs                - number of right-hand sides; <=1 produces Epetra_Vector
//                       objects, otherwise Epetra_MultiVector with nrhs columns
// comm                - communicator used for the map and the timer
// verbose, summary    - select which timing lines are printed
// map, A, b, bt, xexact (Out) - newly allocated objects; the caller deletes them
// StaticProfile       - when true the per-row profile is set to numPoints
// MakeLocalOnly       - when true the matrix is built with rowmap == colmap
void GenerateCrsProblem(int numNodesX, int numNodesY, int numProcsX, int numProcsY, int numPoints, 
			int * xoff, int * yoff, int nrhs,
			const Epetra_Comm  &comm, bool verbose, bool summary,
			Epetra_Map *& map, 
			Epetra_CrsMatrix *& A, 
			Epetra_MultiVector *& b, 
			Epetra_MultiVector *& bt,
			Epetra_MultiVector *&xexact, bool StaticProfile, bool MakeLocalOnly) {
  Epetra_Time timer(comm);
  // Determine my global IDs
  int * myGlobalElements;
  GenerateMyGlobalElements(numNodesX, numNodesY, numProcsX, numProcsY, comm.MyPID(), myGlobalElements);

  int numMyEquations = numNodesX*numNodesY;
  map = new Epetra_Map(-1, numMyEquations, myGlobalElements, 0, comm); // Create map with 2D block partitioning.
  delete [] myGlobalElements;

  int numGlobalEquations = map->NumGlobalElements();

  // Row profile hint: 0 unless a static profile was requested.
  int profile = 0; if (StaticProfile) profile = numPoints;


  // NOTE(review): A is assigned several times below; these look like the arms
  // of if/else and of a conditional-compilation switch whose `else` and
  // directive lines are not visible in this listing - confirm against the
  // original file.
  if (MakeLocalOnly) 
    A = new Epetra_CrsMatrix(Copy, *map, *map, profile, StaticProfile); // Construct matrix with rowmap=colmap
    A = new Epetra_CrsMatrix(Copy, *map, profile, StaticProfile); // Construct matrix


  if (MakeLocalOnly) 
    A = new Epetra_CrsMatrix(Copy, *map, *map, profile); // Construct matrix with rowmap=colmap
    A = new Epetra_CrsMatrix(Copy, *map, profile); // Construct matrix


  // Scratch buffers for one row's column indices and values.
  int * indices = new int[numPoints];
  double * values = new double[numPoints];

  double dnumPoints = (double) numPoints;
  int nx = numNodesX*numProcsX; // global number of grid points in x

  for (int i=0; i<numMyEquations; i++) {

    int rowID = map->GID(i);
    int numIndices = 0;

    for (int j=0; j<numPoints; j++) {
      int colID = rowID + xoff[j] + nx*yoff[j]; // Compute column ID based on stencil offsets
      // Keep only columns that fall inside the global index range.
      if (colID>-1 && colID<numGlobalEquations) {
	indices[numIndices] = colID;
	// value lies in [-1, 0]: negated rand()/RAND_MAX
	double value = - ((double) rand())/ ((double) RAND_MAX);
	if (colID==rowID)
	  values[numIndices++] = dnumPoints - value; // Make diagonal dominant
	  values[numIndices++] = value;
    //cout << "Building row " << rowID << endl;
    A->InsertGlobalValues(rowID, numIndices, values, indices);

  delete [] indices;
  delete [] values;
  // NOTE(review): both timings read the same timer back to back; no fill-complete
  // call is visible between them in this listing.
  double insertTime = timer.ElapsedTime();
  double fillCompleteTime = timer.ElapsedTime();

  if (verbose)
    cout << "Time to insert matrix values = " << insertTime << endl
	 << "Time to complete fill        = " << fillCompleteTime << endl;
  if (summary) {
    if (comm.NumProc()==1) cout << "InsertTime" << '\t';
    cout << insertTime << endl;
    if (comm.NumProc()==1) cout << "FillCompleteTime" << '\t';
    cout << fillCompleteTime << endl;

  // Single RHS: allocate Epetra_Vector objects so the vector overload can
  // downcast them; otherwise allocate nrhs-column multivectors.
  if (nrhs<=1) {  
    b = new Epetra_Vector(*map);
    bt = new Epetra_Vector(*map);
    xexact = new Epetra_Vector(*map);
  else {
    b = new Epetra_MultiVector(*map, nrhs);
    bt = new Epetra_MultiVector(*map, nrhs);
    xexact = new Epetra_MultiVector(*map, nrhs);

  xexact->Random(); // Fill xexact with random values

  // b = A * xexact, bt = A' * xexact
  A->Multiply(false, *xexact, *b);
  A->Multiply(true, *xexact, *bt);

// build maps to make other conversions
// NOTE(review): only the signature is visible here; the body is not part of
// this listing. subMaps is a non-const reference, so it is presumably the
// output (pairs of an int and an Epetra_Map) - confirm against the definition.
void buildSubMaps(const Epetra_Map & globalMap,const std::vector<int> & vars,const Epetra_Comm & comm,
                  std::vector<std::pair<int,Teuchos::RCP<Epetra_Map> > > & subMaps)
// Assembles a tri-diagonal Epetra_FEVbrMatrix over Map, with every process
// submitting entries for every global row, then calls GlobalAssemble().
// Errors from the Epetra calls are accumulated into ierr via EPETRA_TEST_ERR.
//
// Map        - row map of the matrix under test
// NumVectors - not referenced in the visible body
// verbose    - enables progress messages (printed on process 0)
//
// NOTE(review): despite the name, the visible body exercises Epetra_FEVbrMatrix
// rather than multivectors, and no return statement is visible - confirm
// against the original file.
int MultiVectorTests(const Epetra_Map & Map, int NumVectors, bool verbose)
  const Epetra_Comm & Comm = Map.Comm();
  int ierr = 0, i, j;
  /* get the rank of this processor */
  int MyPID   = Comm.MyPID();
  // Construct FEVbrMatrix
  if (verbose && MyPID==0) cout << "constructing Epetra_FEVbrMatrix" << endl;

  //we'll set up a tri-diagonal matrix.

  int numGlobalRows = Map.NumGlobalElements();
  int minLocalRow = Map.MinMyGID();
  int rowLengths = 3;

  Epetra_FEVbrMatrix A(Copy, Map, rowLengths);
  if (verbose && MyPID==0) {
    cout << "calling A.InsertGlobalValues with 1-D data array"<<endl;

  // Column-index scratch buffer, initially the three GIDs starting at this
  // process's first row; overwritten per row in the loop below.
  int numCols = 3;
  int* ptIndices = new int[numCols];
  for(int k=0; k<numCols; ++k) {
    ptIndices[k] = minLocalRow+k;

  // NOTE(review): values_1d is filled and later freed but not otherwise used
  // in the visible code.
  double* values_1d = new double[numCols*numCols];
  for(j=0; j<numCols*numCols; ++j) {
    values_1d[j] = 3.0;

  //For an extreme test, we'll have all processors sum into all rows.

  int minGID = Map.MinAllGID();

  //For now we're going to assume that there's just one point associated with
  //each GID (element).

  double* ptCoefs = new double[3];

  {for(i=0; i<numGlobalRows; ++i) {
    // Interior row: columns i-1, i, i+1 with coefficients -1, 2, -1.
    if (i>0 && i<numGlobalRows-1) {
      ptIndices[0] = minGID+i-1;
      ptIndices[1] = minGID+i;
      ptIndices[2] = minGID+i+1;
      ptCoefs[0] = -1.0;
      ptCoefs[1] = 2.0;
      ptCoefs[2] = -1.0;
      numCols = 3;
    // First row: columns i, i+1, i+2 (still three entries per row).
    else if (i == 0) {
      ptIndices[0] = minGID+i;
      ptIndices[1] = minGID+i+1;
      ptIndices[2] = minGID+i+2;
      ptCoefs[0] = 2.0;
      ptCoefs[1] = -1.0;
      ptCoefs[2] = -1.0;
      numCols = 3;
    // Last row: columns i-2, i-1, i.
    else {
      ptIndices[0] = minGID+i-2;
      ptIndices[1] = minGID+i-1;
      ptIndices[2] = minGID+i;
      ptCoefs[0] = -1.0;
      ptCoefs[1] = -1.0;
      ptCoefs[2] = 2.0;
      numCols = 3;

    int row = minGID+i;

    EPETRA_TEST_ERR( A.BeginInsertGlobalValues(row, rowLengths, ptIndices), ierr);

    // Submit each coefficient as its own 1x1 block entry.
    for(j=0; j<rowLengths; ++j) {
      EPETRA_TEST_ERR( A.SubmitBlockEntry(&(ptCoefs[j]), 1, 1, 1), ierr);

    EPETRA_TEST_ERR( A.EndSubmitEntries(), ierr);


  if (verbose&&MyPID==0) {
    cout << "calling A.GlobalAssemble()" << endl;

  EPETRA_TEST_ERR( A.GlobalAssemble(), ierr );

  if (verbose&&MyPID==0) {
  cout << "after globalAssemble"<<endl;
  if (verbose) {

  // Release the scratch buffers.
  delete [] values_1d;
  delete [] ptIndices;
  delete [] ptCoefs;

// Example driver: reads an HB problem from FileName, redistributes it onto a
// uniform map, creates an Ifpack_CrsIct preconditioner object and runs an
// AztecOO iteration.
// NOTE(review): several lines appear to be missing from this listing (the
// #else/#endif of the HAVE_MPI blocks, the calls announced by the "Init
// values", "compute the factors" and "set the IFPACK preconditioner" comments,
// and closing braces) - confirm against the original file.
int main(int argc, char *argv[]) {

#ifdef HAVE_MPI
  Epetra_MpiComm Comm (MPI_COMM_WORLD);
  Epetra_SerialComm Comm;

  int MyPID = Comm.MyPID();
  // Only process 0 is verbose.
  bool verbose = false; 
  if (MyPID==0) verbose = true;

  // matrix downloaded from MatrixMarket
  char FileName[] = "../HBMatrices/fidap005.rua";

  Epetra_Map * readMap; // Pointers because of Trilinos_Util_ReadHb2Epetra
  Epetra_CrsMatrix * readA; 
  Epetra_Vector * readx; 
  Epetra_Vector * readb;
  Epetra_Vector * readxexact;
  // Call routine to read in HB problem
  Trilinos_Util_ReadHb2Epetra(FileName, Comm, readMap, readA, readx, 
			      readb, readxexact);

  int NumGlobalElements = readMap->NumGlobalElements();

  // Create uniform distributed map
  Epetra_Map map(NumGlobalElements, 0, Comm);

  // Create Exporter to distribute read-in matrix and vectors

  Epetra_Export exporter(*readMap, map);
  Epetra_CrsMatrix A(Copy, map, 0);
  Epetra_Vector x(map);
  Epetra_Vector b(map);
  Epetra_Vector xexact(map);

  Epetra_Time FillTimer(Comm);
  A.Export(*readA, exporter, Add);
  x.Export(*readx, exporter, Add);
  b.Export(*readb, exporter, Add);
  xexact.Export(*readxexact, exporter, Add);

  // The read-in objects are no longer needed once exported.
  delete readA;
  delete readx;
  delete readb;
  delete readxexact;
  delete readMap;
  // ============================ //
  // Construct ILU preconditioner //
  // ---------------------------- //

  //  modify those parameters 
  int    LevelFill = 1;
  double DropTol = 0.0;
  double Condest;
  Ifpack_CrsIct * ICT = NULL;
  ICT = new Ifpack_CrsIct(A,DropTol,LevelFill);
  // Init values from A
  // compute the factors
  // and now estimate the condition number
  // NOTE(review): no statement that assigns Condest is visible before it is
  // printed below.
  cout << Condest << endl;
  if( Comm.MyPID() == 0 ) {
    cout << "Condition number estimate (level-of-fill = "
	 << LevelFill <<  ") = " << Condest << endl;

  // Define label for printing out during the solve phase
  string label = "Ifpack_CrsIct Preconditioner: LevelFill = " + toString(LevelFill) + 
                                                 " Overlap = 0"; 
  // Here we create an AztecOO object
  AztecOO solver;
  // Here we set the IFPACK preconditioner and specify a few parameters

  // Niters is used both as the AZ_kspace option value and as the iteration limit.
  int Niters = 1200;
  solver.SetAztecOption(AZ_kspace, Niters);
  solver.SetAztecOption(AZ_output, 20); 
  solver.Iterate(Niters, 5.0e-5);

  if (ICT!=0) delete ICT;
#ifdef HAVE_MPI
  MPI_Finalize() ;

return 0 ;
// Consistency checks for an Epetra_Map against the values it was built from.
//
// Map               - map under test
// NumGlobalElements - expected global element count (not referenced in the
//                     visible code)
// NumMyElements     - expected local element count
// MyGlobalElements  - expected local GID list, or 0 for a linear map
// IndexBase         - index base the map was built with
// Comm              - communicator used for rank queries and GatherAll
// DistributedGlobal - whether the map is expected to be distributed
//
// Returns ierr, which EPETRA_TEST_ERR updates when a check fails.
//
// NOTE(review): many checking lines appear to be missing from this listing;
// several buffers and counters below are read or accumulated without a visible
// write or test. The notes mark each spot - confirm against the original file.
int checkmap(Epetra_Map & Map, int NumGlobalElements, int NumMyElements, 
	     int *MyGlobalElements, int IndexBase, Epetra_Comm& Comm,
	     bool DistributedGlobal)
  int i, ierr=0, forierr = 0;



  int *MyElementSizeList = new int[NumMyElements];


  // Count entries whose element size is not 1.
  // NOTE(review): no visible call fills MyElementSizeList before this read,
  // and forierr is not tested afterwards in the visible code.
  forierr = 0;
  for (i=0; i<NumMyElements; i++) forierr += MyElementSizeList[i]!=1;

  delete [] MyElementSizeList;

  const Epetra_Comm & Comm1 = Map.Comm();




  // A map is expected to be linear exactly when no GID list was supplied.
  EPETRA_TEST_ERR(!Map.LinearMap() && MyGlobalElements==0,ierr);

  EPETRA_TEST_ERR(Map.LinearMap() && MyGlobalElements!=0,ierr);



  int MaxLID = Map.MaxLID();

  // Expected largest local GID for an even linear distribution.
  // NOTE(review): the +3 shift for ranks above 2 presumably mirrors how the
  // calling test sizes its maps - confirm against the caller.
  int MaxMyGID = (Comm.MyPID()+1)*NumMyElements-1+IndexBase;
  if (Comm.MyPID()>2) MaxMyGID+=3;
  if (!DistributedGlobal) MaxMyGID = NumMyElements-1+IndexBase;




  // Expected smallest local GID, with the same adjustments.
  int MinMyGID = Comm.MyPID()*NumMyElements+IndexBase;
  if (Comm.MyPID()>2) MinMyGID+=3;
  if (!DistributedGlobal) MinMyGID = 0;
  // NOTE(review): no visible call fills MyGlobalElements1 before it is compared.
  int * MyGlobalElements1 = new int[NumMyElements];

  // Compare the map's GID list with the expected one (contiguous from MinMyGID
  // when no list was supplied, otherwise the supplied list).
  forierr = 0;
  if (MyGlobalElements==0)
      for (i=0; i<NumMyElements; i++) 
	forierr += MyGlobalElements1[i]!=MinMyGID+i;
  else {
    for (i=0; i<NumMyElements; i++)
      forierr += MyGlobalElements[i]!=MyGlobalElements1[i];


  // GID -> LID -> GID and LID -> GID -> LID must round-trip.
  int MaxMyGID2 = Map.GID(Map.LID(MaxMyGID));
  EPETRA_TEST_ERR(MaxMyGID2 != MaxMyGID,ierr);
  int MaxLID2 = Map.LID(Map.GID(MaxLID));
  EPETRA_TEST_ERR(MaxLID2 != MaxLID,ierr);

  EPETRA_TEST_ERR(Map.GID(MaxLID+1) != IndexBase-1,ierr);// MaxLID+1 doesn't exist
  EPETRA_TEST_ERR(Map.LID(MaxMyGID+1) != -1,ierr);// MaxMyGID+1 doesn't exist or is on a different processor





  // Check RemoteIDList function
  // Get some GIDs off of each processor to test
  int TotalNumEle, NumElePerProc, NumProc = Comm.NumProc();
  // NOTE(review): MinNumEleOnProc has no visible assignment before it is read,
  // and NumMyEle is not used in the visible code.
  int MinNumEleOnProc;
  int NumMyEle=Map.NumMyElements();
  // Sample at most 6 GIDs per process.
  if (MinNumEleOnProc > 5) NumElePerProc = 6;
  else NumElePerProc = MinNumEleOnProc;
  if (NumElePerProc > 0) {
    TotalNumEle = NumElePerProc*NumProc;
    int * MyGIDlist = new int[NumElePerProc];
    int * GIDlist = new int[TotalNumEle];
    int * PIDlist = new int[TotalNumEle];
    int * LIDlist = new int[TotalNumEle];
    for (i=0; i<NumElePerProc; i++)
	  MyGIDlist[i] = MyGlobalElements1[i];
    Comm.GatherAll(MyGIDlist,GIDlist,NumElePerProc);// Get a few values from each proc
    Map.RemoteIDList(TotalNumEle, GIDlist, PIDlist, LIDlist);
    int MyPID= Comm.MyPID();

    forierr = 0;
    for (i=0; i<TotalNumEle; i++) {
      // Locally owned GID: reported PID must be ours and the LID must agree
      // with the map's own GID/LID lookups.
      if (Map.MyGID(GIDlist[i])) {
	forierr += PIDlist[i] != MyPID;
	forierr += !Map.MyLID(Map.LID(GIDlist[i])) || Map.LID(GIDlist[i]) != LIDlist[i] || Map.GID(LIDlist[i]) != GIDlist[i];
      else {
	forierr += PIDlist[i] == MyPID; // If MyGID comes back false, the PID listed should be that of another proc

    delete [] MyGIDlist;
    delete [] GIDlist;
    delete [] PIDlist;
    delete [] LIDlist;

  delete [] MyGlobalElements1;

  // Check RemoteIDList function (assumes all maps are linear, even if not stored that way)

  if (Map.LinearMap()) {

    int * GIDList = new int[3];
    int * PIDList = new int[3];
    int * LIDList = new int[3];
    int MyPID = Map.Comm().MyPID();
    int NumIDs = 0;
    //GIDList[NumIDs++] = Map.MaxAllGID()+1; // Should return -1 for both PID and LID
    // Query the GIDs just below and just above this process's range, when
    // they exist globally.
    if (Map.MinMyGID()-1>=Map.MinAllGID()) GIDList[NumIDs++] = Map.MinMyGID()-1;
    if (Map.MaxMyGID()+1<=Map.MaxAllGID()) GIDList[NumIDs++] = Map.MaxMyGID()+1;

    Map.RemoteIDList(NumIDs, GIDList, PIDList, LIDList);

    // Walk the answers in the same order the queries were added.
    NumIDs = 0;


    // The lower neighbour GID must live on rank MyPID-1; the upper neighbour
    // GID must live on rank MyPID+1 with local ID 0.
    if (Map.MinMyGID()-1>=Map.MinAllGID()) EPETRA_TEST_ERR(!(PIDList[NumIDs++]==MyPID-1),ierr);
    if (Map.MaxMyGID()+1<=Map.MaxAllGID()) EPETRA_TEST_ERR(!(PIDList[NumIDs]==MyPID+1),ierr);
    if (Map.MaxMyGID()+1<=Map.MaxAllGID()) EPETRA_TEST_ERR(!(LIDList[NumIDs++]==0),ierr);

    delete [] GIDList;
    delete [] PIDList;
    delete [] LIDList;

  return (ierr);
// Example driver: reads an HB problem from FileName, redistributes the
// vectors and the matrix onto a uniform map, and reports on process 0 how long
// each phase took.
// All three reported durations are deltas of the single cumulative FillTimer,
// so each one subtracts every phase that preceded it.
// NOTE(review): the #else/#endif of the HAVE_MPI blocks, closing braces and
// any call between the last two timer reads are not visible in this listing -
// confirm against the original file.
int main(int argc, char *argv[]) {

#ifdef HAVE_MPI
  Epetra_MpiComm Comm (MPI_COMM_WORLD);
  Epetra_SerialComm Comm;

  int MyPID = Comm.MyPID();

  // matrix downloaded from MatrixMarket
  char FileName[] = "../HBMatrices/fidap005.rua";

  Epetra_Map * readMap; // Pointers because of Trilinos_Util_ReadHb2Epetra
  Epetra_CrsMatrix * readA; 
  Epetra_Vector * readx; 
  Epetra_Vector * readb;
  Epetra_Vector * readxexact;
  // Call routine to read in HB problem
  Trilinos_Util_ReadHb2Epetra(FileName, Comm, readMap, readA, readx, 
			      readb, readxexact);

  int NumGlobalElements = readMap->NumGlobalElements();

  // Create uniform distributed map
  Epetra_Map map(NumGlobalElements, 0, Comm);

  // Create Exporter to distribute read-in matrix and vectors

  Epetra_Export exporter(*readMap, map);
  Epetra_CrsMatrix A(Copy, map, 0);
  Epetra_Vector x(map);
  Epetra_Vector b(map);
  Epetra_Vector xexact(map);

  Epetra_Time FillTimer(Comm);
  // Phase 1: vectors.
  x.Export(*readx, exporter, Add);
  b.Export(*readb, exporter, Add);
  xexact.Export(*readxexact, exporter, Add);
  double vectorRedistributeTime = FillTimer.ElapsedTime();
  // Phase 2: matrix (cumulative time minus phase 1).
  A.Export(*readA, exporter, Add);
  double matrixRedistributeTime = FillTimer.ElapsedTime() - vectorRedistributeTime;
  // Phase 3: cumulative time minus BOTH earlier phases. Previously only the
  // matrix time was subtracted, so the vector time was counted twice.
  double fillCompleteTime = FillTimer.ElapsedTime() - matrixRedistributeTime - vectorRedistributeTime;

  if( MyPID==0 ) {
    cout << "Vector redistribute  time (sec) = "
	 << vectorRedistributeTime<< endl;
    cout << "Matrix redistribute time (sec) = "
	 << matrixRedistributeTime << endl;
    cout << "Transform to Local  time (sec) = "
	 << fillCompleteTime << endl<< endl;

  // The read-in objects are no longer needed once exported.
  delete readA;
  delete readx;
  delete readb;
  delete readxexact;
  delete readMap;

#ifdef HAVE_MPI
  MPI_Finalize() ;

int Amesos_TestSolver( Epetra_Comm &Comm, char *matrix_file, 
		       SparseSolverType SparseSolver,
		       bool transpose, 
		       int special, AMESOS_MatrixType matrix_type ) {

  Epetra_Map * readMap;
  Epetra_CrsMatrix * readA; 
  Epetra_Vector * readx; 
  Epetra_Vector * readb;
  Epetra_Vector * readxexact;
  std::string FileName = matrix_file ;
  int FN_Size = FileName.size() ; 
  std::string LastFiveBytes = FileName.substr( EPETRA_MAX(0,FN_Size-5), FN_Size );
  std::string LastFourBytes = FileName.substr( EPETRA_MAX(0,FN_Size-4), FN_Size );
  bool NonContiguousMap = false; 

  if ( LastFiveBytes == ".triU" ) { 
    // Call routine to read in unsymmetric Triplet matrix
    NonContiguousMap = true; 
    EPETRA_CHK_ERR( Trilinos_Util_ReadTriples2Epetra( matrix_file, false, Comm, readMap, readA, readx, 
						      readb, readxexact, NonContiguousMap ) );
  } else {
    if ( LastFiveBytes == ".triS" ) { 
      NonContiguousMap = true; 
      // Call routine to read in symmetric Triplet matrix
      EPETRA_CHK_ERR( Trilinos_Util_ReadTriples2Epetra( matrix_file, true, Comm, readMap, readA, readx, 
							readb, readxexact, NonContiguousMap ) );
    } else {
      if (  LastFourBytes == ".mtx" ) { 
	EPETRA_CHK_ERR( Trilinos_Util_ReadMatrixMarket2Epetra( matrix_file, Comm, readMap, 
							       readA, readx, readb, readxexact) );
      } else {
	// Call routine to read in HB problem
	Trilinos_Util_ReadHb2Epetra( matrix_file, Comm, readMap, readA, readx, 
						     readb, readxexact) ;

  Epetra_CrsMatrix transposeA(Copy, *readMap, 0);
  Epetra_CrsMatrix *serialA ; 

  if ( transpose ) {
    assert( CrsMatrixTranspose( readA, &transposeA ) == 0 ); 
    serialA = &transposeA ; 
  } else {
    serialA = readA ; 

  Epetra_RowMatrix * passA = 0; 
  Epetra_Vector * passx = 0; 
  Epetra_Vector * passb = 0;
  Epetra_Vector * passxexact = 0;
  Epetra_Vector * passresid = 0;
  Epetra_Vector * passtmp = 0;

  // Create uniform distributed map
  Epetra_Map map(readMap->NumGlobalElements(), 0, Comm);
  Epetra_Map* map_;

  if( NonContiguousMap ) {
    //  map gives us NumMyElements and MyFirstElement;
    int NumGlobalElements =  readMap->NumGlobalElements();
    int NumMyElements = map.NumMyElements();
    int MyFirstElement = map.MinMyGID();
    std::vector<int> MapMap_( NumGlobalElements );
    readMap->MyGlobalElements( &MapMap_[0] ) ;
    Comm.Broadcast( &MapMap_[0], NumGlobalElements, 0 ) ; 
    map_ = new Epetra_Map( NumGlobalElements, NumMyElements, &MapMap_[MyFirstElement], 0, Comm);
  } else {
    map_ = new Epetra_Map( map ) ; 

  Epetra_CrsMatrix A(Copy, *map_, 0);

  const Epetra_Map &OriginalMap = serialA->RowMatrixRowMap() ; 
  assert( OriginalMap.SameAs(*readMap) ); 
  Epetra_Export exporter(OriginalMap, *map_);
  Epetra_Export exporter2(OriginalMap, *map_);
  Epetra_Export MatrixExporter(OriginalMap, *map_);
  Epetra_CrsMatrix AwithDiag(Copy, *map_, 0);

  Epetra_Vector x(*map_);
  Epetra_Vector b(*map_);
  Epetra_Vector xexact(*map_);
  Epetra_Vector resid(*map_);
  Epetra_Vector readresid(*readMap);
  Epetra_Vector tmp(*map_);
  Epetra_Vector readtmp(*readMap);

  //  Epetra_Vector xcomp(*map_);      // X as computed by the solver
  bool distribute_matrix = ( matrix_type == AMESOS_Distributed ) ; 
  if ( distribute_matrix ) { 
    // Create Exporter to distribute read-in matrix and vectors
    //  Initialize x, b and xexact to the values read in from the file
    x.Export(*readx, exporter, Add);
    b.Export(*readb, exporter, Add);
    xexact.Export(*readxexact, exporter, Add);
    A.Export(*serialA, exporter, Add);

    passA = &A; 

    passx = &x; 
    passb = &b;
    passxexact = &xexact;
    passresid = &resid;
    passtmp = &tmp;

  } else { 

    passA = serialA; 
    passx = readx; 
    passb = readb;
    passxexact = readxexact;
    passresid = &readresid;
    passtmp = &readtmp;

  Epetra_MultiVector CopyB( *passb ) ;

  double Anorm = passA->NormInf() ; 
  SparseDirectTimingVars::SS_Result.Set_Anorm(Anorm) ;

  Epetra_LinearProblem Problem(  (Epetra_RowMatrix *) passA, 
				 (Epetra_MultiVector *) passx, 
				 (Epetra_MultiVector *) passb );

  for ( int i = 0; i < 1+special ; i++ ) { 
    Epetra_Time TotalTime( Comm ) ; 
    if ( false ) { 
      //  TEST_UMFPACK is never set by configure
    } else if ( SparseSolver == SUPERLUDIST ) {
	Teuchos::ParameterList ParamList ;
	ParamList.set( "MaxProcs", -3 );
	Amesos_Superludist A_Superludist( Problem ) ; 

  //ParamList.set( "Redistribute", true );
  //ParamList.set( "AddZeroToDiag", true );
  Teuchos::ParameterList& SuperludistParams = ParamList.sublist("Superludist") ;
  ParamList.set( "MaxProcs", -3 );

	EPETRA_CHK_ERR( A_Superludist.SetParameters( ParamList ) ); 
	EPETRA_CHK_ERR( A_Superludist.SetUseTranspose( transpose ) ); 
	EPETRA_CHK_ERR( A_Superludist.SymbolicFactorization(  ) ); 
	EPETRA_CHK_ERR( A_Superludist.NumericFactorization(  ) ); 
	EPETRA_CHK_ERR( A_Superludist.Solve(  ) ); 
    } else if ( SparseSolver == DSCPACK ) {
      Teuchos::ParameterList ParamList ;
      ParamList.set( "MaxProcs", -3 );

      Amesos_Dscpack A_dscpack( Problem ) ; 
      EPETRA_CHK_ERR( A_dscpack.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( A_dscpack.SymbolicFactorization(  ) ); 
      EPETRA_CHK_ERR( A_dscpack.NumericFactorization(  ) ); 
      EPETRA_CHK_ERR( A_dscpack.Solve(  ) ); 
    } else if ( SparseSolver == SCALAPACK ) {

      Teuchos::ParameterList ParamList ;
      Amesos_Scalapack A_scalapack( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( A_scalapack.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( A_scalapack.SetUseTranspose( transpose ) ); 
      EPETRA_CHK_ERR( A_scalapack.SymbolicFactorization(  ) ); 
      EPETRA_CHK_ERR( A_scalapack.NumericFactorization(  ) ); 
      EPETRA_CHK_ERR( A_scalapack.Solve(  ) ); 

    } else if ( SparseSolver == TAUCS ) {

      Teuchos::ParameterList ParamList ;
      Amesos_Taucs A_taucs( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( A_taucs.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( A_taucs.SetUseTranspose( transpose ) ); 
      EPETRA_CHK_ERR( A_taucs.SymbolicFactorization(  ) ); 
      EPETRA_CHK_ERR( A_taucs.NumericFactorization(  ) ); 
      EPETRA_CHK_ERR( A_taucs.Solve(  ) ); 

    } else if ( SparseSolver == PARDISO ) {

      Teuchos::ParameterList ParamList ;
      Amesos_Pardiso A_pardiso( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( A_pardiso.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( A_pardiso.SetUseTranspose( transpose ) ); 
      EPETRA_CHK_ERR( A_pardiso.SymbolicFactorization(  ) ); 
      EPETRA_CHK_ERR( A_pardiso.NumericFactorization(  ) ); 
      EPETRA_CHK_ERR( A_pardiso.Solve(  ) ); 

    } else if ( SparseSolver == PARAKLETE ) {

      Teuchos::ParameterList ParamList ;
      Amesos_Paraklete A_paraklete( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( A_paraklete.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( A_paraklete.SetUseTranspose( transpose ) ); 
      EPETRA_CHK_ERR( A_paraklete.SymbolicFactorization(  ) ); 
      EPETRA_CHK_ERR( A_paraklete.NumericFactorization(  ) ); 
      EPETRA_CHK_ERR( A_paraklete.Solve(  ) ); 

    } else if ( SparseSolver == MUMPS ) {

      Teuchos::ParameterList ParamList ;
      Amesos_Mumps A_mumps( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( A_mumps.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( A_mumps.SetUseTranspose( transpose ) ); 
      EPETRA_CHK_ERR( A_mumps.SymbolicFactorization(  ) ); 
      EPETRA_CHK_ERR( A_mumps.NumericFactorization(  ) ); 
      EPETRA_CHK_ERR( A_mumps.Solve(  ) ); 

    } else if ( SparseSolver == SUPERLU ) {

      Teuchos::ParameterList ParamList ;
      Amesos_Superlu A_superlu( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( A_superlu.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( A_superlu.SetUseTranspose( transpose ) ); 
      EPETRA_CHK_ERR( A_superlu.SymbolicFactorization(  ) ); 
      EPETRA_CHK_ERR( A_superlu.NumericFactorization(  ) ); 
      EPETRA_CHK_ERR( A_superlu.Solve(  ) ); 

    } else if ( SparseSolver == LAPACK ) {

      Teuchos::ParameterList ParamList ;
      Amesos_Lapack A_lapack( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( A_lapack.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( A_lapack.SetUseTranspose( transpose ) ); 
      EPETRA_CHK_ERR( A_lapack.SymbolicFactorization(  ) ); 
      EPETRA_CHK_ERR( A_lapack.NumericFactorization(  ) ); 
      EPETRA_CHK_ERR( A_lapack.Solve(  ) ); 
    } else if ( SparseSolver == UMFPACK ) {

      Teuchos::ParameterList ParamList ;
      Amesos_Umfpack A_umfpack( Problem ) ; 
      ParamList.set( "MaxProcs", -3 );
      EPETRA_CHK_ERR( A_umfpack.SetParameters( ParamList ) ); 
      EPETRA_CHK_ERR( A_umfpack.SetUseTranspose( transpose ) ); 
      EPETRA_CHK_ERR( A_umfpack.SymbolicFactorization(  ) ); 
      EPETRA_CHK_ERR( A_umfpack.NumericFactorization(  ) ); 
      EPETRA_CHK_ERR( A_umfpack.Solve(  ) ); 
    } else if ( SparseSolver == KLU ) {

      using namespace Teuchos;

      Amesos_Time AT; 
      int setupTimePtr = -1, symTimePtr = -1, numTimePtr = -1, refacTimePtr = -1, solveTimePtr = -1;
      AT.CreateTimer(Comm, 2);

      Teuchos::ParameterList ParamList ;
      // ParamList.set("OutputLevel",2);
      Amesos_Klu A_klu( Problem ); 
      ParamList.set( "MaxProcs", -3 );
      ParamList.set( "TrustMe", false );
      // ParamList.set( "Refactorize", true );
      EPETRA_CHK_ERR( A_klu.SetParameters( ParamList ) ) ; 
      EPETRA_CHK_ERR( A_klu.SetUseTranspose( transpose ) ); 
      setupTimePtr = AT.AddTime("Setup", setupTimePtr, 0);
      EPETRA_CHK_ERR( A_klu.SymbolicFactorization(  ) ); 
      symTimePtr = AT.AddTime("Symbolic", symTimePtr, 0);
      EPETRA_CHK_ERR( A_klu.NumericFactorization(  ) ); 
      numTimePtr = AT.AddTime("Numeric", numTimePtr, 0);
      EPETRA_CHK_ERR( A_klu.NumericFactorization(  ) ); 
      refacTimePtr = AT.AddTime("Refactor", refacTimePtr, 0);
      // for ( int i=0; i<100000 ; i++ ) 
      EPETRA_CHK_ERR( A_klu.Solve(  ) ); 
      solveTimePtr = AT.AddTime("Solve", solveTimePtr, 0);

      double SetupTime = AT.GetTime(setupTimePtr);
      double SymbolicTime = AT.GetTime(symTimePtr);
      double NumericTime = AT.GetTime(numTimePtr);
      double RefactorTime = AT.GetTime(refacTimePtr);
      double SolveTime = AT.GetTime(solveTimePtr);

      std::cout << __FILE__ << "::"  << __LINE__ << " SetupTime = " << SetupTime << std::endl ; 
      std::cout << __FILE__ << "::"  << __LINE__ << " SymbolicTime = " << SymbolicTime - SetupTime << std::endl ; 
      std::cout << __FILE__ << "::"  << __LINE__ << " NumericTime = " << NumericTime - SymbolicTime<< std::endl ; 
      std::cout << __FILE__ << "::"  << __LINE__ << " RefactorTime = " << RefactorTime - NumericTime << std::endl ; 
      std::cout << __FILE__ << "::"  << __LINE__ << " SolveTime = " << SolveTime - RefactorTime << std::endl ; 

    } else { 
      SparseDirectTimingVars::log_file << "Solver not implemented yet" << std::endl ;
      std::cerr << "\n\n####################  Requested solver not available on this platform ##################### ATS\n" << std::endl ;
      std::cout << " SparseSolver = " << SparseSolver << std::endl ; 
      std::cerr << " SparseSolver = " << SparseSolver << std::endl ; 
    SparseDirectTimingVars::SS_Result.Set_Total_Time( TotalTime.ElapsedTime() ); 
  }  // end for (int i=0; i<special; i++ ) 

  //  Compute the error = norm(xcomp - xexact )
  double error;
  passresid->Update(1.0, *passx, -1.0, *passxexact, 0.0);

  SparseDirectTimingVars::SS_Result.Set_Error(error) ;

  //  passxexact->Norm2(&error ) ; 
  //  passx->Norm2(&error ) ; 

  //  Compute the residual = norm(Ax - b)
  double residual ; 

  passA->Multiply( transpose, *passx, *passtmp);
  passresid->Update(1.0, *passtmp, -1.0, *passb, 0.0); 
  //  passresid->Update(1.0, *passtmp, -1.0, CopyB, 0.0); 

  SparseDirectTimingVars::SS_Result.Set_Residual(residual) ;
  double bnorm; 
  passb->Norm2( &bnorm ) ; 
  SparseDirectTimingVars::SS_Result.Set_Bnorm(bnorm) ;

  double xnorm; 
  passx->Norm2( &xnorm ) ; 
  SparseDirectTimingVars::SS_Result.Set_Xnorm(xnorm) ;

  delete readA;
  delete readx;
  delete readb;
  delete readxexact;
  delete readMap;
  delete map_;

  return 0;