Example #1
0
// Create, configure, initialize, and compute an Ifpack2 preconditioner for A.
//
// test_params: the "Ifpack2::Preconditioner" entry selects the preconditioner
//   type handed to Ifpack2::Factory; if an "Ifpack2" sublist is present, it is
//   passed to the preconditioner's setParameters().
// A: the sparse matrix to precondition.
//
// Progress messages, the time spent in compute(), and a low-verbosity
// description of the preconditioner go to std::cout on the process whose rank
// in A's row Map communicator is 0.
//
// Returns the computed preconditioner.
Teuchos::RCP<Tpetra::Operator<Scalar,LocalOrdinal,GlobalOrdinal,Node> >
build_precond (Teuchos::ParameterList& test_params,
               const Teuchos::RCP<const Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> >& A)
{
  typedef Tpetra::RowMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> row_matrix_type;
  typedef Ifpack2::Preconditioner<Scalar,LocalOrdinal,GlobalOrdinal,Node> prec_type;

  // Timer that accumulates only the time spent in compute().
  Teuchos::Time computeTime ("precond");
  const bool isRoot = (A->getRowMap ()->getComm ()->getRank () == 0);

  Teuchos::RCP<Teuchos::FancyOStream> out =
    Teuchos::getFancyOStream (Teuchos::rcpFromRef (std::cout));

  // Ask the factory for the requested preconditioner type.
  Ifpack2::Factory precFactory;
  std::string precName ("not specified");
  Ifpack2::getParameter (test_params, "Ifpack2::Preconditioner", precName);
  Teuchos::RCP<prec_type> prec = precFactory.create<row_matrix_type> (precName, A);

  // Parameters for the preconditioner itself; left empty if there is
  // no "Ifpack2" sublist.
  Teuchos::ParameterList precParams;
  if (test_params.isSublist ("Ifpack2")) {
    precParams = test_params.sublist ("Ifpack2");
  }

  if (isRoot) {
    *out << "Configuring, initializing, and computing Ifpack2 preconditioner" << std::endl;
  }
  {
    Teuchos::OSTab setupTab (*out);
    prec->setParameters (precParams);
    prec->initialize ();
    {
      // Only compute() is timed.
      Teuchos::TimeMonitor computeMonitor (computeTime);
      prec->compute ();
    }
    if (isRoot) {
      *out << "Finished computing Ifpack2 preconditioner" << std::endl;
      Teuchos::OSTab timeTab (*out);
      *out << "Time (s): " << computeTime.totalElapsedTime () << std::endl;
    }
  }
  if (isRoot) {
    *out << "Preconditioner attributes:" << std::endl;
    Teuchos::OSTab describeTab (*out);
    prec->describe (*out, Teuchos::VERB_LOW);
  }

  return prec;
}
  // Print this object to the given stream by delegating to describe()
  // at the default verbosity level.
  void
  DistObjectKA<Packet,LocalOrdinal,GlobalOrdinal,Node>::print (std::ostream &os) const
  {
    // describe() wants a FancyOStream, so wrap the caller's stream
    // through a reference-based RCP (rcpFromRef).
    Teuchos::RCP<Teuchos::FancyOStream> fancyOut =
      Teuchos::getFancyOStream (Teuchos::rcpFromRef (os));
    this->describe (*fancyOut, Teuchos::VERB_DEFAULT);
  }
// Smoke test for Belos::BlockGCRODRSolMgr with Tpetra objects.
//
// The test instantiates the solver, hands it an empty parameter list,
// generates a nonsymmetric linear system with Belos::Tpetra::ProblemMaker,
// and calls solve().
//
// Command-line options: --numRows, --tolerant/--intolerant,
// --verbose/--quiet, --debug/--release.
//
// Returns EXIT_FAILURE if command-line parsing does not succeed;
// otherwise prints "End Result: TEST PASSED" on Rank 0 and returns
// EXIT_SUCCESS.  Exceptions (e.g., from the null checks below) are not
// caught here.
int
main (int argc, char *argv[])
{
  using Teuchos::Comm;
  using Teuchos::FancyOStream;
  using Teuchos::getFancyOStream;
  using Teuchos::oblackholestream;
  using Teuchos::ParameterList;
  using Teuchos::parameterList;
  using Teuchos::RCP;
  using Teuchos::rcpFromRef;
  using std::cout;
  using std::endl;
  //
  // Typedefs for Tpetra template arguments.
  //
  typedef double scalar_type;
  typedef long int global_ordinal_type;
  typedef int local_ordinal_type;
  typedef Kokkos::DefaultNode::DefaultNodeType node_type;
  //
  // Tpetra objects which are the MV and OP template parameters of the
  // Belos specialization which we are testing.
  //
  typedef Tpetra::MultiVector<scalar_type, local_ordinal_type, global_ordinal_type, node_type> MV;
  typedef Tpetra::Operator<scalar_type, local_ordinal_type, global_ordinal_type, node_type> OP;
  //
  // Other typedefs.
  //
  typedef Tpetra::CrsMatrix<scalar_type, local_ordinal_type, global_ordinal_type, node_type> sparse_matrix_type;

  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &cout);
  RCP<const Comm<int> > comm = Tpetra::DefaultPlatform::getDefaultPlatform().getComm();
  RCP<node_type> node = Tpetra::DefaultPlatform::getDefaultPlatform().getNode();
  RCP<oblackholestream> blackHole (new oblackholestream);
  const int myRank = comm->getRank();

  // Output stream that prints only on Rank 0.
  RCP<FancyOStream> out;
  if (myRank == 0) {
    out = Teuchos::getFancyOStream (rcpFromRef (cout));
  } else {
    out = Teuchos::getFancyOStream (blackHole);
  }

  //
  // Get test parameters from command-line processor.
  //
  // CommandLineProcessor always understands int, but may not
  // understand global_ordinal_type.  We convert to the latter below.
  int numRows = comm->getSize() * 100;
  bool tolerant = false;
  bool verbose = false;
  bool debug = false;
  Teuchos::CommandLineProcessor cmdp (false, true);
  cmdp.setOption("numRows", &numRows,
                 "Global number of rows (and columns) in the sparse matrix to generate.");
  cmdp.setOption("tolerant", "intolerant", &tolerant,
                 "Whether to parse files tolerantly.");
  cmdp.setOption("verbose", "quiet", &verbose,
                 "Print messages and results.");
  cmdp.setOption("debug", "release", &debug,
                 "Run debugging checks and print copious debugging output.");
  if (cmdp.parse(argc,argv) != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL) {
    *out << "\nEnd Result: TEST FAILED" << endl;
    return EXIT_FAILURE;
  }
  // Output stream for verbose output.
  RCP<FancyOStream> verbOut = verbose ? out : getFancyOStream (blackHole);

  // NOTE(review): success is a constant, and the result of solve() below
  // is only printed, never folded into it.  As written, the test passes
  // whenever the solver can be instantiated and run without throwing --
  // confirm that this is the intent.
  const bool success = true;

  // Test whether it's possible to instantiate the solver.
  // This is a minimal compilation test.
  *verbOut << "Instantiating Block GCRODR solver" << endl;
  Belos::BlockGCRODRSolMgr<scalar_type, MV, OP> solver;
  //
  // Test setting solver parameters.  For now, we just use an empty
  // (but non-null) parameter list, which the solver should fill in
  // with defaults.
  //
  *verbOut << "Setting solver parameters" << endl;
  RCP<ParameterList> solverParams = parameterList ();
  solver.setParameters (solverParams);
  //
  // Create a linear system to solve.
  //
  *verbOut << "Creating linear system" << endl;
  RCP<sparse_matrix_type> A;
  RCP<MV> X_guess, X_exact, B;
  {
    typedef Belos::Tpetra::ProblemMaker<sparse_matrix_type> factory_type;
    factory_type factory (comm, node, out, tolerant, debug);

    RCP<ParameterList> problemParams = parameterList ();
    problemParams->set ("Global number of rows",
                        static_cast<global_ordinal_type> (numRows));
    problemParams->set ("Problem type", std::string ("Nonsymmetric"));
    factory.makeProblem (A, X_guess, X_exact, B, problemParams);
  }

  // Validate everything the factory produced BEFORE using any of it.
  // In particular, X_guess must be checked before it is dereferenced
  // to build X below.
  TEUCHOS_TEST_FOR_EXCEPTION(A.is_null(), std::logic_error,
                             "The sparse matrix is null!");
  TEUCHOS_TEST_FOR_EXCEPTION(X_guess.is_null(), std::logic_error,
                             "The initial guess X_guess is null!");
  TEUCHOS_TEST_FOR_EXCEPTION(X_exact.is_null(), std::logic_error,
                             "The exact solution X_exact is null!");
  TEUCHOS_TEST_FOR_EXCEPTION(B.is_null(), std::logic_error,
                             "The right-hand side B is null!");

  // Approximate solution vector is a copy of the guess vector.
  RCP<MV> X (new MV (*X_guess));
  TEUCHOS_TEST_FOR_EXCEPTION(X.is_null(), std::logic_error,
                             "The approximate solution vector X is null!");

  typedef Belos::LinearProblem<scalar_type, MV, OP> problem_type;
  RCP<problem_type> problem (new problem_type (A, X, B));
  problem->setProblem ();
  solver.setProblem (problem);

  *verbOut << "Solving linear system" << endl;
  Belos::ReturnType result = solver.solve ();

  *verbOut << "Result of solve: "
           << Belos::convertReturnTypeToString (result)
           << endl;
  if (success) {
    *out << "\nEnd Result: TEST PASSED" << endl;
    return EXIT_SUCCESS;
  }
  else {
    *out << "\nEnd Result: TEST FAILED" << endl;
    return EXIT_FAILURE;
  }
}
  // Print this Export's data members to os, one process at a time, in
  // rank order, followed by the source Map, the target Map, and the
  // Distributor.
  //
  // This method calls barrier() on the source Map's communicator.
  void Export<LocalOrdinal,GlobalOrdinal,Node>::
  print (std::ostream& os) const
  {
    using Teuchos::toString;
    using std::endl;

    Teuchos::RCP<const Teuchos::Comm<int> > comm = getSourceMap ()->getComm ();
    const int myRank = comm->getRank ();
    const int numProcs = comm->getSize ();
    const bool isRoot = (myRank == 0);

    // Each process takes its turn printing its own data.
    for (int turn = 0; turn < numProcs; ++turn) {
      if (turn == myRank) {
        os << endl;
        if (isRoot) { // Print the header only once.
          os << "Export Data Members:" << endl;
        }
        os << "Image ID       : " << myRank << endl;

        os << "permuteFromLIDs: " << toString (getPermuteFromLIDs ()) << endl;
        os << "permuteToLIDs  : " << toString (getPermuteToLIDs ()) << endl;
        os << "remoteLIDs     : " << toString (getRemoteLIDs ()) << endl;
        os << "exportLIDs     : " << toString (getExportLIDs ()) << endl;
        os << "exportPIDs     : " << toString (getExportPIDs ()) << endl;

        os << "numSameIDs     : " << getNumSameIDs () << endl;
        os << "numPermuteIDs  : " << getNumPermuteIDs () << endl;
        os << "numRemoteIDs   : " << getNumRemoteIDs () << endl;
        os << "numExportIDs   : " << getNumExportIDs () << endl;
      }
      // A few global barriers give output a chance to complete.
      for (int b = 0; b < 3; ++b) {
        comm->barrier ();
      }
    }

    // Source Map: header on Rank 0, then the Map prints itself.
    if (isRoot) {
      os << endl << endl << "Source Map:" << endl << std::flush;
    }
    comm->barrier ();
    os << *getSourceMap ();
    comm->barrier ();

    // Target Map: same pattern.
    if (isRoot) {
      os << endl << endl << "Target Map:" << endl << std::flush;
    }
    comm->barrier ();
    os << *getTargetMap ();
    comm->barrier ();

    // Printing the Distributor as well helps with debugging;
    // Epetra_Export::Print() does this too, which permits a
    // side-by-side comparison.
    if (isRoot) {
      os << endl << endl << "Distributor:" << endl << std::flush;
    }
    comm->barrier ();
    Teuchos::RCP<Teuchos::FancyOStream> fancyOut =
      Teuchos::getFancyOStream (Teuchos::rcpFromRef (os));
    getDistributor ().describe (*fancyOut, Teuchos::VERB_EXTREME);
  }
// Apply the preconditioner: solve with solver_ using X as the right-hand
// side, write the solution into Y, and scale the result by alpha.
//
// X:     right-hand side(s).
// Y:     on output, alpha times the solution computed by solver_.
// mode:  NOTE(review): this argument is not consulted anywhere in this
//        method, so a transpose request is not honored -- confirm
//        whether callers rely on it.
// alpha: scaling factor applied to the computed solution.
// beta:  must be zero; any other value is rejected.
//
// Throws std::runtime_error if compute() has not been called, or if X
// and Y have different numbers of columns.  Throws std::logic_error if
// beta != 0.
//
// On success, increments NumApply_ and sets ApplyTime_ to the timer's
// total elapsed time.
void
SupportGraph<MatrixType>::
apply (const Tpetra::MultiVector<scalar_type,
                                 local_ordinal_type,
                                 global_ordinal_type,
                                 node_type>& X,
       Tpetra::MultiVector<scalar_type,
                           local_ordinal_type,
                           global_ordinal_type,
                           node_type>& Y,
       Teuchos::ETransp mode,
       scalar_type alpha,
       scalar_type beta) const
{
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::rcpFromRef;
  using Teuchos::Time;
  using Teuchos::TimeMonitor;
  typedef Tpetra::MultiVector<scalar_type, local_ordinal_type,
    global_ordinal_type, node_type> MV;

  // Create a timer for this method, if it doesn't exist already.
  // TimeMonitor::getNewCounter registers the timer, so that
  // TimeMonitor's class methods like summarize() will report the
  // total time spent in successful calls to this method.
  const std::string timerName ("Ifpack2::SupportGraph::apply");
  RCP<Time> timer = TimeMonitor::lookupCounter(timerName);
  if (timer.is_null()) {
    timer = TimeMonitor::getNewCounter(timerName);
  }

  { // Start timing here.
    Teuchos::TimeMonitor timeMon (*timer);

    TEUCHOS_TEST_FOR_EXCEPTION(
      ! isComputed(), std::runtime_error,
      "Ifpack2::SupportGraph::apply: You must call compute() to compute the "
      "incomplete factorization, before calling apply().");

    TEUCHOS_TEST_FOR_EXCEPTION(
      X.getNumVectors() != Y.getNumVectors(), std::runtime_error,
      "Ifpack2::SupportGraph::apply: X and Y must have the same number of "
      "columns.  X has " << X.getNumVectors() << " columns, but Y has "
      << Y.getNumVectors() << " columns.");

    TEUCHOS_TEST_FOR_EXCEPTION(
      beta != STS::zero(), std::logic_error,
      "Ifpack2::SupportGraph::apply: This method does not currently work when "
      "beta != 0.");

    // If X and Y are pointing to the same memory location,
    // we need to create an auxiliary vector, Xcopy
    RCP<const MV> Xcopy;
    if (X.getLocalMV().getValues() == Y.getLocalMV().getValues()) {
      Xcopy = rcp (new MV(X));
    }
    else {
      Xcopy = rcpFromRef(X);
    }

    RCP<MV> Ycopy = rcpFromRef(Y);

    solver_->setB(Xcopy);
    solver_->setX(Ycopy);

    solver_->solve ();

    // Scale AFTER the solve.  The solve writes its result into Y, so
    // scaling Y beforehand would be overwritten and alpha would have no
    // effect on the output.
    if (alpha != STS::one()) {
      Y.scale(alpha);
    }
  } // Stop timing here.

  ++NumApply_;

  // timer->totalElapsedTime() returns the total time over all timer
  // calls.  Thus, we use = instead of +=.
  ApplyTime_ = timer->totalElapsedTime();
}
// Driver for the Tpetra version of the Intrepid Poisson example.
//
// Steps, in order:
//   1. Parse command-line arguments (mesh dimensions, XML input file,
//      solver name, tolerance, iteration limit, GPU options, and
//      optional output file names).
//   2. Optionally read a ParameterList from an XML file.
//   3. Assemble the sparse matrix A, right-hand side B, exact solution
//      X_exact, and solution vector X.
//   4. Optionally write the matrix and/or its row Map to files, and
//      optionally exit right after assembly.
//   5. Print residual and matrix norms for the exact solution.
//   6. Optionally build a MueLu preconditioner.
//   7. Solve with Belos (GPU or non-GPU path) and print the relative
//      error with respect to X_exact.
//   8. Report timings.
//
// Returns EXIT_SUCCESS if "success" is still true at the end (and on
// the --help and exitAfterAssembly early exits), else EXIT_FAILURE.
//
// NOTE(review): the "converged" flag filled in by the solve is not
// consulted afterwards; the exit status presumably reflects only
// whether an exception was caught -- confirm that this is intended.
int
main (int argc, char *argv[])
{
  using namespace TrilinosCouplings; // Yes, this means I'm lazy.

  using TpetraIntrepidPoissonExample::exactResidualNorm;
  using TpetraIntrepidPoissonExample::makeMatrixAndRightHandSide;
  using TpetraIntrepidPoissonExample::solveWithBelos;
  using TpetraIntrepidPoissonExample::solveWithBelosGPU;
  using IntrepidPoissonExample::makeMeshInput;
  using IntrepidPoissonExample::parseCommandLineArguments;
  using IntrepidPoissonExample::setCommandLineArgumentDefaults;
  using IntrepidPoissonExample::setMaterialTensorOffDiagonalValue;
  using IntrepidPoissonExample::setUpCommandLineArguments;
  using Tpetra::DefaultPlatform;
  using Teuchos::Comm;
  using Teuchos::outArg;
  using Teuchos::ParameterList;
  using Teuchos::parameterList;
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::rcpFromRef;
  using Teuchos::getFancyOStream;
  using Teuchos::FancyOStream;
  using std::endl;
  // Pull in typedefs from the example's namespace.
  typedef TpetraIntrepidPoissonExample::ST ST;
  // LO and GO are only needed as MueLu template arguments below.
#ifdef HAVE_TRILINOSCOUPLINGS_MUELU
  typedef TpetraIntrepidPoissonExample::LO LO;
  typedef TpetraIntrepidPoissonExample::GO GO;
#endif // HAVE_TRILINOSCOUPLINGS_MUELU
  typedef TpetraIntrepidPoissonExample::Node Node;
  typedef Teuchos::ScalarTraits<ST> STS;
  typedef STS::magnitudeType MT;
  typedef Teuchos::ScalarTraits<MT> STM;
  typedef TpetraIntrepidPoissonExample::sparse_matrix_type sparse_matrix_type;
  typedef TpetraIntrepidPoissonExample::vector_type vector_type;
  typedef TpetraIntrepidPoissonExample::operator_type operator_type;

  // Passed to the catch macro after the try block; decides the exit code.
  bool success = true;
  try {
    Teuchos::oblackholestream blackHole;
    Teuchos::GlobalMPISession mpiSession (&argc, &argv, &blackHole);
    const int myRank = mpiSession.getRank ();
    //const int numProcs = mpiSession.getNProc ();

    // Get the default communicator and Kokkos Node instance
    RCP<const Comm<int> > comm =
      DefaultPlatform::getDefaultPlatform ().getComm ();
    RCP<Node> node = DefaultPlatform::getDefaultPlatform ().getNode ();

    // Did the user specify --help at the command line to print help
    // with command-line arguments?
    bool printedHelp = false;
    // Values of command-line arguments.
    int nx, ny, nz;
    std::string xmlInputParamsFile;
    bool verbose, debug;
    int maxNumItersFromCmdLine = -1; // -1 means "read from XML file"
    double tolFromCmdLine = -1.0; // -1 means "read from XML file"
    std::string solverName = "GMRES";
    ST materialTensorOffDiagonalValue = 0.0;

    // Set default values of command-line arguments.
    setCommandLineArgumentDefaults (nx, ny, nz, xmlInputParamsFile,
                                    solverName, verbose, debug);
    // Parse and validate command-line arguments.
    Teuchos::CommandLineProcessor cmdp (false, true);
    setUpCommandLineArguments (cmdp, nx, ny, nz, xmlInputParamsFile,
                               solverName, tolFromCmdLine,
                               maxNumItersFromCmdLine,
                               verbose, debug);
    cmdp.setOption ("materialTensorOffDiagonalValue",
                    &materialTensorOffDiagonalValue, "Off-diagonal value in "
                    "the material tensor.  This controls the iteration count.  "
                    "Be careful with this if you use CG, since you can easily "
                    "make the matrix indefinite.");

    // Additional command-line arguments for GPU experimentation.
    bool gpu = false;
    cmdp.setOption ("gpu", "no-gpu", &gpu,
                    "Run example using GPU node (if supported)");
    int ranks_per_node = 1;
    cmdp.setOption ("ranks_per_node", &ranks_per_node,
                    "Number of MPI ranks per node");
    int gpu_ranks_per_node = 1;
    cmdp.setOption ("gpu_ranks_per_node", &gpu_ranks_per_node,
                    "Number of MPI ranks per node for GPUs");
    int device_offset = 0;
    cmdp.setOption ("device_offset", &device_offset,
                    "Offset for attaching MPI ranks to CUDA devices");

    // Additional command-line arguments for dumping the generated
    // matrix or its row Map to output files.
    //
    // FIXME (mfh 09 Apr 2014) Need to port these command-line
    // arguments to the Epetra version.

    // If matrixFilename is nonempty, dump the matrix to that file
    // in MatrixMarket format.
    std::string matrixFilename;
    cmdp.setOption ("matrixFilename", &matrixFilename, "If nonempty, dump the "
                    "generated matrix to that file in MatrixMarket format.");

    // If rowMapFilename is nonempty, dump the matrix's row Map to
    // that file in MatrixMarket format.
    std::string rowMapFilename;
    cmdp.setOption ("rowMapFilename", &rowMapFilename, "If nonempty, dump the "
                    "generated matrix's row Map to that file in a format that "
                    "Tpetra::MatrixMarket::Reader can read.");
    // Option to exit after building A and b (and dumping stuff to
    // files, if requested).
    bool exitAfterAssembly = false;
    cmdp.setOption ("exitAfterAssembly", "dontExitAfterAssembly",
                    &exitAfterAssembly, "If true, exit after building the "
                    "sparse matrix and dense right-hand side vector.  If either"
                    " --matrixFilename or --rowMapFilename are nonempty strings"
                    ", dump the matrix resp. row Map to their respective files "
                    "before exiting.");

    parseCommandLineArguments (cmdp, printedHelp, argc, argv, nx, ny, nz,
                               xmlInputParamsFile, solverName, verbose, debug);
    if (printedHelp) {
      // The user specified --help at the command line to print help
      // with command-line arguments.  We printed help already, so quit
      // with a happy return code.
      return EXIT_SUCCESS;
    }

    setMaterialTensorOffDiagonalValue (materialTensorOffDiagonalValue);

    // Both streams only print on MPI Rank 0.  "out" only prints if the
    // user specified --verbose.  "err" only prints if the user
    // specified --debug.
    RCP<FancyOStream> out =
      getFancyOStream (rcpFromRef ((myRank == 0 && verbose) ? std::cout : blackHole));
    RCP<FancyOStream> err =
      getFancyOStream (rcpFromRef ((myRank == 0 && debug) ? std::cerr : blackHole));

#ifdef HAVE_MPI
    *out << "PARALLEL executable" << endl;
#else
    *out << "SERIAL executable" << endl;
#endif

    /**********************************************************************************/
    /********************************** GET XML INPUTS ********************************/
    /**********************************************************************************/
    ParameterList inputList;
    if (xmlInputParamsFile != "") {
      *out << "Reading parameters from XML file \""
           << xmlInputParamsFile << "\"..." << endl;
      Teuchos::updateParametersFromXmlFile (xmlInputParamsFile,
                                            outArg (inputList));
      if (myRank == 0) {
        inputList.print (*out, 2, true, true);
        *out << endl;
      }
    }

    // Get Pamgen mesh definition string, either from the input
    // ParameterList or from our function that makes a cube and fills in
    // the number of cells along each dimension.
    std::string meshInput = inputList.get("meshInput", "");
    if (meshInput == "") {
      *out << "Generating mesh input string: nx = " << nx
           << ", ny = " << ny
           << ", nz = " << nz << endl;
      meshInput = makeMeshInput (nx, ny, nz);
    }

    // Total application run time
    {
      TEUCHOS_FUNC_TIME_MONITOR_DIFF("Total Time", total_time);

      RCP<sparse_matrix_type> A;
      RCP<vector_type> B, X_exact, X;
      {
        TEUCHOS_FUNC_TIME_MONITOR_DIFF("Total Assembly", total_assembly);
        makeMatrixAndRightHandSide (A, B, X_exact, X, comm, node, meshInput,
                                    out, err, verbose, debug);
      }

      // Optionally dump the matrix and/or its row Map to files.
      {
        typedef Tpetra::MatrixMarket::Writer<sparse_matrix_type> writer_type;
        if (matrixFilename != "") {
          writer_type::writeSparseFile (matrixFilename, A);
        }
        if (rowMapFilename != "") {
          writer_type::writeMapFile (rowMapFilename, * (A->getRowMap ()));
        }
      }

      if (exitAfterAssembly) {
        // Users might still be interested in assembly time.
        Teuchos::TimeMonitor::report (comm.ptr (), std::cout);
        return EXIT_SUCCESS;
      }

      // norms[0], norms[1], and norms[2] are printed below as the
      // residual norm, the norm of B, and the Frobenius norm of A.
      const std::vector<MT> norms = exactResidualNorm (A, B, X_exact);
      // X_exact is the exact solution of the PDE, projected onto the
      // discrete mesh.  It may not necessarily equal the exact solution
      // of the linear system.
      *out << "||B - A*X_exact||_2 = " << norms[0] << endl
           << "||B||_2 = " << norms[1] << endl
           << "||A||_F = " << norms[2] << endl;

      // Setup preconditioner.  M stays null unless the "Preconditioner"
      // parameter is "MueLu".
      std::string prec_type = inputList.get ("Preconditioner", "None");
      RCP<operator_type> M;
      {
        TEUCHOS_FUNC_TIME_MONITOR_DIFF("Total Preconditioner Setup", total_prec);

        if (prec_type == "MueLu") {
#ifdef HAVE_TRILINOSCOUPLINGS_MUELU
          // Use the "MueLu" sublist if the input provides one.
          if (inputList.isSublist("MueLu")) {
            ParameterList mueluParams = inputList.sublist("MueLu");
            M = MueLu::CreateTpetraPreconditioner<ST,LO,GO,Node>(A,mueluParams);
          } else {
            M = MueLu::CreateTpetraPreconditioner<ST,LO,GO,Node>(A);
          }
#else // NOT HAVE_TRILINOSCOUPLINGS_MUELU
          // MueLu was requested but is not available in this build.
          TEUCHOS_TEST_FOR_EXCEPTION(
            prec_type == "MueLu", std::runtime_error, "Tpetra scaling example: "
            "In order to precondition with MueLu, you must have built Trilinos "
            "with the MueLu package enabled.");
#endif // HAVE_TRILINOSCOUPLINGS_MUELU
        }
      } // setup preconditioner

      // Get the convergence tolerance for each linear solve.
      // If the user provided a nonnegative value at the command
      // line, it overrides any value in the input ParameterList.
      MT tol = STM::squareroot (STM::eps ()); // default value
      if (tolFromCmdLine < STM::zero ()) {
        tol = inputList.get ("Convergence Tolerance", tol);
      } else {
        tol = tolFromCmdLine;
      }

      // Get the maximum number of iterations for each linear solve.
      // If the user provided a value other than -1 at the command
      // line, it overrides any value in the input ParameterList.
      int maxNumIters = 200; // default value
      if (maxNumItersFromCmdLine == -1) {
        maxNumIters = inputList.get ("Maximum Iterations", maxNumIters);
      } else {
        maxNumIters = maxNumItersFromCmdLine;
      }

      // Get the number of "time steps."  We imitate a time-dependent
      // PDE by doing this many linear solves.
      const int num_steps = inputList.get ("Number of Time Steps", 1);

      // Do the linear solve(s).
      bool converged = false;
      int numItersPerformed = 0;
      if (gpu) {
        TEUCHOS_FUNC_TIME_MONITOR_DIFF("Total GPU Solve", total_solve);
        solveWithBelosGPU (converged, numItersPerformed, tol, maxNumIters,
                           num_steps, ranks_per_node, gpu_ranks_per_node,
                           device_offset, prec_type, X, A, B, Teuchos::null, M);
      }
      else {
        TEUCHOS_FUNC_TIME_MONITOR_DIFF("Total Solve", total_solve);
        solveWithBelos (converged, numItersPerformed, solverName, tol,
                        maxNumIters, num_steps, X, A, B, Teuchos::null, M);
      }

      // Compute ||X-X_exact||_2.  The update below overwrites X_exact
      // with the difference, so its norm is taken first.
      const MT norm_x = X_exact->norm2 ();
      X_exact->update (-1.0, *X, 1.0);
      const MT norm_error = X_exact->norm2 ();
      *out << endl
           << "||X - X_exact||_2 / ||X_exact||_2 = " << norm_error / norm_x
           << endl;
    } // total time block

    // Summarize timings
    Teuchos::TimeMonitor::report (comm.ptr (), std::cout);
  } // try
  TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success);

  if (success) {
    return EXIT_SUCCESS;
  } else {
    return EXIT_FAILURE;
  }
}
Example #7
0
int 
main (int argc, char *argv[])
{
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::Comm;
  using Teuchos::CommandLineProcessor;
  using Teuchos::FancyOStream;
  using Teuchos::getFancyOStream;
  using Teuchos::OSTab;
  using Teuchos::ptr;
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::rcpFromRef;
  using std::cout;
  using std::endl;

  bool success = true; // May be changed by tests

  Teuchos::oblackholestream blackHole;
  //Teuchos::GlobalMPISession (&argc, &argv, &blackHole);
  MPI_Init (&argc, &argv);

  //
  // Construct communicators, and verify that we are on 4 processors.
  //

  // Construct a Teuchos Comm object.
  RCP<const Comm<int> > teuchosComm = Teuchos::DefaultComm<int>::getComm();
  const int numProcs = teuchosComm->getSize();
  const int pid = teuchosComm->getRank();
  RCP<FancyOStream> pOut = 
    getFancyOStream (rcpFromRef ((pid == 0) ? std::cout : blackHole));
  FancyOStream& out = *pOut;
  // Verify that we are on four processors (which manifests the bug).
  if (teuchosComm->getSize() != 4) {
    out << "This test must be run on four processors.  Exiting ..." << endl;
    return EXIT_FAILURE;
  }

  // We also need an Epetra Comm, so that we can compare Tpetra and
  // Epetra results.
  Epetra_MpiComm epetraComm (MPI_COMM_WORLD);

  //
  // Default values of command-line options.
  //
  bool verbose = false;
  bool printEpetra = false;
  bool printTpetra = false;
  CommandLineProcessor cmdp (false,true);
  //
  // Set command-line options.
  //
  cmdp.setOption ("verbose", "quiet", &verbose, "Print verbose output.");
  // Epetra and Tpetra output will ask the Maps and Import objects to
  // print themselves in distributed, maximally verbose fashion.  It's
  // best to turn on either Epetra or Tpetra, but not both.  Then you
  // can compare their output side by side.
  cmdp.setOption ("printEpetra", "dontPrintEpetra", &printEpetra, 
		  "Print Epetra output (in verbose mode only).");
  cmdp.setOption ("printTpetra", "dontPrintTpetra", &printTpetra, 
		  "Print Tpetra output (in verbose mode only).");
  // Parse command-line options.
  if (cmdp.parse (argc,argv) != CommandLineProcessor::PARSE_SUCCESSFUL) {
    out << "End Result: TEST FAILED" << endl;
    MPI_Finalize ();
    return EXIT_FAILURE;
  }

  if (verbose) {
    out << "Running test on " << numProcs << " process" 
	<< (numProcs != 1 ? "es" : "") << "." << endl;
  }

  // The maps for this problem are derived from a 3D structured mesh.
  // In this example, the dimensions are 4x4x2 and there are 2
  // processors assigned to the first dimension and 2 processors
  // assigned to the second dimension, with no parallel decomposition
  // along the third dimension.  The "owned" arrays represent the
  // one-to-one map, with each array representing a 2x2x2 slice.  If
  // DIMENSIONS == 2, then only the first 4 values will be used,
  // representing a 2x2(x1) slice.
  int owned0[8] = { 0, 1, 4, 5,16,17,20,21};
  int owned1[8] = { 2, 3, 6, 7,18,19,22,23};
  int owned2[8] = { 8, 9,12,13,24,25,28,29};
  int owned3[8] = {10,11,14,15,26,27,30,31};

  // The "overlap" arrays represent the map with communication
  // elements, with each array representing a 3x3x2 slice.  If
  // DIMENSIONS == 2, then only the first 9 values will be used,
  // representing a 3x3(x1) slice.
  int overlap0[18] = {0,1,2,4, 5, 6, 8, 9,10,16,17,18,20,21,22,24,25,26};
  int overlap1[18] = {1,2,3,5, 6, 7, 9,10,11,17,18,19,21,22,23,25,26,27};
  int overlap2[18] = {4,5,6,8, 9,10,12,13,14,20,21,22,24,25,26,28,29,30};
  int overlap3[18] = {5,6,7,9,10,11,13,14,15,21,22,23,25,26,27,29,30,31};

  // Construct the owned and overlap maps for both Epetra and Tpetra.
  int* owned;
  int* overlap;
  if (pid == 0) {
    owned   = owned0;
    overlap = overlap0;
  }
  else if (pid == 1) {
    owned   = owned1;
    overlap = overlap1;
  }
  else if (pid == 2) {
    owned   = owned2;
    overlap = overlap2;
  }
  else {
    owned   = owned3;
    overlap = overlap3;
  }

#if DIMENSIONS == 2
  int ownedSize   = 4;
  int overlapSize = 9;
#elif DIMENSIONS == 3
  int ownedSize   =  8;
  int overlapSize = 18;
#endif

  // Create the two Epetra Maps.  Source for the Import is the owned
  // map; target for the Import is the overlap map.
  Epetra_Map epetraOwnedMap (  -1, ownedSize,   owned,   0, epetraComm);
  Epetra_Map epetraOverlapMap (-1, overlapSize, overlap, 0, epetraComm);

  if (verbose && printEpetra) {
    // Have the Epetra_Map objects describe themselves.
    //
    // Epetra_BlockMap::Print() takes an std::ostream&, and expects
    // all MPI processes to be able to write to it.  (The method
    // handles its own synchronization.)
    out << "Epetra owned map:" << endl;
    epetraOwnedMap.Print (std::cout);
    out << "Epetra overlap map:" << endl;
    epetraOverlapMap.Print (std::cout);
  }

  // Create the two Tpetra Maps.  The "invalid" global element count
  // input tells Tpetra::Map to compute the global number of elements
  // itself.
  const int invalid = Teuchos::OrdinalTraits<int>::invalid();
  RCP<Tpetra::Map<int> > tpetraOwnedMap = 
    rcp (new Tpetra::Map<int> (invalid, ArrayView<int> (owned, ownedSize), 
			       0, teuchosComm));
  tpetraOwnedMap->setObjectLabel ("Owned Map");
  RCP<Tpetra::Map<int> > tpetraOverlapMap =
    rcp (new Tpetra::Map<int> (invalid, ArrayView<int> (overlap, overlapSize),
			       0, teuchosComm));
  tpetraOverlapMap->setObjectLabel ("Overlap Map");

  // In verbose mode, have the Tpetra::Map objects describe themselves.
  if (verbose && printTpetra) {
    Teuchos::EVerbosityLevel verb = Teuchos::VERB_EXTREME;

    // Tpetra::Map::describe() takes a FancyOStream, but expects all
    // MPI processes to be able to write to it.  (The method handles
    // its own synchronization.)
    RCP<FancyOStream> globalOut = getFancyOStream (rcpFromRef (std::cout));
    out << "Tpetra owned map:" << endl;
    {
      OSTab tab (globalOut);
      tpetraOwnedMap->describe (*globalOut, verb);
    }
    out << "Tpetra overlap map:" << endl;
    {
      OSTab tab (globalOut);
      tpetraOverlapMap->describe (*globalOut, verb);
    }
  }

  // Use the owned and overlap maps to construct an importer for both
  // Epetra and Tpetra.
  Epetra_Import       epetraImporter (epetraOverlapMap, epetraOwnedMap  );
  Tpetra::Import<int> tpetraImporter (tpetraOwnedMap  , tpetraOverlapMap);

  // In verbose mode, have the Epetra_Import object describe itself.
  if (verbose && printEpetra) {
    out << "Epetra importer:" << endl;
    // The importer's Print() method takes an std::ostream& and plans
    // to write to it on all MPI processes (handling synchronization
    // itself).
    epetraImporter.Print (std::cout);
    out << endl;
  }

  // In verbose mode, have the Tpetra::Import object describe itself.
  if (verbose && printTpetra) {
    out << "Tpetra importer:" << endl;
    // The importer doesn't implement Teuchos::Describable.  It wants
    // std::cout and plans to write to it on all MPI processes (with
    // its own synchronization).
    tpetraImporter.print (std::cout);
    out << endl;
  }

  // Construct owned and overlap vectors for both Epetra and Tpetra.
  Epetra_Vector epetraOwnedVector   (epetraOwnedMap  );
  Epetra_Vector epetraOverlapVector (epetraOverlapMap);
  Tpetra::Vector<double,int> tpetraOwnedVector   (tpetraOwnedMap  );
  Tpetra::Vector<double,int> tpetraOverlapVector (tpetraOverlapMap);

  // The test is as follows: initialize the owned and overlap vectors
  // with global IDs in the owned regions.  Initialize the overlap
  // vectors to equal -1 in the overlap regions.  Then perform a
  // communication from the owned vectors to the overlap vectors.  The
  // resulting overlap vectors should have global IDs everywhere and
  // all of the -1 values should be overwritten.

  // Initialize.  We cannot assign directly to the Tpetra Vectors;
  // instead, we extract nonconst views and assign to those.  The
  // results aren't guaranteed to be committed to the vector unless
  // the views are released (by assigning Teuchos::null to them).
  epetraOverlapVector.PutScalar(-1);
  tpetraOverlapVector.putScalar(-1);
  ArrayRCP<double> tpetraOwnedArray   = tpetraOwnedVector.getDataNonConst(0);
  ArrayRCP<double> tpetraOverlapArray = tpetraOverlapVector.getDataNonConst(0);
  for (int owned_lid = 0; 
       owned_lid < tpetraOwnedMap->getNodeElementList().size(); 
       ++owned_lid) {
    int gid         = tpetraOwnedMap->getGlobalElement(owned_lid);
    int overlap_lid = tpetraOverlapMap->getLocalElement(gid);
    epetraOwnedVector[owned_lid]     = gid;
    epetraOverlapVector[overlap_lid] = gid;
    tpetraOwnedArray[owned_lid]      = gid;
    tpetraOverlapArray[overlap_lid]  = gid;
  }
  // Make sure that the changes to the Tpetra Vector were committed,
  // by releasing the nonconst views.
  tpetraOwnedArray = Teuchos::null;
  tpetraOverlapArray = Teuchos::null;

  // Test the Epetra and Tpetra Import.
  if (verbose) {
    out << "Testing Import from owned Map to overlap Map:" << endl << endl;
  }
  epetraOverlapVector.Import(  epetraOwnedVector, epetraImporter, Insert);
  tpetraOverlapVector.doImport(tpetraOwnedVector, tpetraImporter, 
			       Tpetra::INSERT);
  // Check the Import results.
  success = countFailures (teuchosComm, epetraOwnedMap, epetraOwnedVector, 
			   epetraOverlapMap, epetraOverlapVector, 
			   tpetraOwnedMap, tpetraOwnedVector, 
			   tpetraOverlapMap, tpetraOverlapVector, verbose);

  const bool testOtherDirections = false;
  if (testOtherDirections) {
    //
    // Reinitialize the Tpetra vectors and test whether Export works.
    //
    tpetraOverlapVector.putScalar(-1);
    tpetraOwnedArray   = tpetraOwnedVector.getDataNonConst(0);
    tpetraOverlapArray = tpetraOverlapVector.getDataNonConst(0);
    for (int owned_lid = 0; 
	 owned_lid < tpetraOwnedMap->getNodeElementList().size(); 
	 ++owned_lid) 
      {
	int gid         = tpetraOwnedMap->getGlobalElement(owned_lid);
	int overlap_lid = tpetraOverlapMap->getLocalElement(gid);
	tpetraOwnedArray[owned_lid]      = gid;
	tpetraOverlapArray[overlap_lid]  = gid;
      }
    // Make sure that the changes to the Tpetra Vector were committed,
    // by releasing the nonconst views.
    tpetraOwnedArray = Teuchos::null;
    tpetraOverlapArray = Teuchos::null;

    // Make a Tpetra Export object, and test the export.
    Tpetra::Export<int> tpetraExporter1 (tpetraOwnedMap, tpetraOverlapMap);
    if (verbose) {
      out << "Testing Export from owned Map to overlap Map:" << endl << endl;
    }
    tpetraOverlapVector.doExport (tpetraOwnedVector, tpetraExporter1, 
				  Tpetra::INSERT);

    // Check the Export results.
    success = countFailures (teuchosComm, epetraOwnedMap, epetraOwnedVector, 
			     epetraOverlapMap, epetraOverlapVector, 
			     tpetraOwnedMap, tpetraOwnedVector, 
			     tpetraOverlapMap, tpetraOverlapVector, verbose);
    //
    // Reinitialize the Tpetra vectors and see what Import in the
    // other direction does.
    //
    tpetraOverlapVector.putScalar(-1);
    tpetraOwnedArray   = tpetraOwnedVector.getDataNonConst(0);
    tpetraOverlapArray = tpetraOverlapVector.getDataNonConst(0);
    for (int owned_lid = 0; 
	 owned_lid < tpetraOwnedMap->getNodeElementList().size(); 
	 ++owned_lid) 
      {
	int gid         = tpetraOwnedMap->getGlobalElement(owned_lid);
	int overlap_lid = tpetraOverlapMap->getLocalElement(gid);
	tpetraOwnedArray[owned_lid]      = gid;
	tpetraOverlapArray[overlap_lid]  = gid;
      }
    // Make sure that the changes to the Tpetra Vector were committed,
    // by releasing the nonconst views.
    tpetraOwnedArray = Teuchos::null;
    tpetraOverlapArray = Teuchos::null;

    if (verbose) {
      out << "Testing Import from overlap Map to owned Map:" << endl << endl;
    }
    Tpetra::Import<int> tpetraImporter2 (tpetraOverlapMap, tpetraOwnedMap);
    tpetraOwnedVector.doImport (tpetraOverlapVector, tpetraImporter2, 
				Tpetra::INSERT);
    // Check the Import results.
    success = countFailures (teuchosComm, epetraOwnedMap, epetraOwnedVector, 
			     epetraOverlapMap, epetraOverlapVector, 
			     tpetraOwnedMap, tpetraOwnedVector, 
			     tpetraOverlapMap, tpetraOverlapVector, verbose);
  } // if testOtherDirections

  out << "End Result: TEST " << (success ? "PASSED" : "FAILED") << endl;
  MPI_Finalize ();
  return success ? EXIT_SUCCESS : EXIT_FAILURE;
}
Example #8
0
// Driver for the Tpetra MultiVectorFiller test.
//
// Command-line options: --unknownsPerNode, --unknownsPerElt,
// --numCols, and --verbose / --quiet.  The test body is
// Tpetra::Test::testMultiVectorFiller; the test counts as failed on
// a process if that call throws an std::exception there.
//
// Returns EXIT_SUCCESS only if the test succeeded on every process
// of the default communicator.  Returns EXIT_FAILURE otherwise, and
// also when command-line parsing does not report success.
int 
main (int argc, char *argv[]) 
{
  using Teuchos::as;
  using Teuchos::Comm;
  using Teuchos::FancyOStream;
  using Teuchos::getFancyOStream;
  using Teuchos::ParameterList;
  using Teuchos::parameterList;
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::rcpFromRef;
  using std::cerr;
  using std::cout;
  using std::endl;

  typedef double scalar_type;
  typedef int local_ordinal_type;
#if defined(HAVE_TPETRA_EXPLICIT_INSTANTIATION) && defined(HAVE_TPETRA_INST_INT_LONG)
  typedef long global_ordinal_type;
#else
  typedef int  global_ordinal_type;
#endif
  typedef Kokkos::SerialNode node_type;

  Teuchos::oblackholestream blackHole;
  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &blackHole);
  RCP<const Comm<int> > comm = Teuchos::DefaultComm<int>::getComm();

  //
  // Read in command line arguments.
  //  
  int unknownsPerNode = 20; // number of unknowns per process
  int unknownsPerElt = 3; // number of unknowns per (overlapping) element
  int numCols = 1;
  bool verbose = false;

  Teuchos::CommandLineProcessor cmdp (false, true);
  cmdp.setOption ("unknownsPerNode", &unknownsPerNode, 
		  "Number of unknowns per process");
  cmdp.setOption ("unknownsPerElt", &unknownsPerElt, 
		  "Number of unknowns per (overlapping) element.");
  cmdp.setOption ("numCols", &numCols, 
		  "Number of columns in the multivector.  Must be positive.");
  cmdp.setOption ("verbose", "quiet", &verbose, 
		  "Whether to print verbose output.");
  if (cmdp.parse(argc,argv) != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL) {
    return EXIT_FAILURE;
  }

  // In quiet mode, "out" discards everything written to it.
  const Teuchos::EVerbosityLevel verbLevel = 
    verbose ? Teuchos::VERB_EXTREME : Teuchos::VERB_DEFAULT;
  RCP<FancyOStream> out = verbose ? getFancyOStream (rcpFromRef (std::cout)) :
    getFancyOStream (rcpFromRef (blackHole));

  RCP<ParameterList> nodeParams = parameterList ("Kokkos Node");
  RCP<node_type> node = makeSerialNode (nodeParams);

  // Run the test.  It succeeds on this process unless it throws.
  bool succeeded = true;
  try {
    Tpetra::Test::testMultiVectorFiller<scalar_type, 
      local_ordinal_type, 
      global_ordinal_type, 
      node_type> (comm, node, as<size_t> (unknownsPerNode),
		  as<global_ordinal_type> (unknownsPerElt), 
		  as<size_t> (numCols), out, verbLevel);
  } catch (const std::exception& e) {
    // Report on stderr rather than through "out": "out" is a black
    // hole unless --verbose was given, which would hide the reason
    // for the failure.
    cerr << "MultiVectorFiller test threw an exception:  " << e.what() << endl;
    succeeded = false;
  }

  // The test passes globally only if it passed on every process.
  // Taking the minimum of the 0/1 flags yields 1 exactly in that
  // case.  (A sum would be nonzero as soon as a single process
  // succeeded, which would mask failures on the other processes.)
  const int localSuccess = succeeded ? 1 : 0;
  int globalSuccess = localSuccess;
  Teuchos::reduceAll (*comm, Teuchos::REDUCE_MIN, localSuccess, 
		      Teuchos::ptr (&globalSuccess));
  
  if (globalSuccess == 1) {
    std::cout << "End Result: TEST PASSED" << endl;
    return EXIT_SUCCESS;
  }
  else {
    std::cout << "End Result: TEST FAILED" << endl;
    return EXIT_FAILURE;
  }
}
int
main (int argc, char *argv[])
{
  using namespace TrilinosCouplings; // Yes, this means I'm lazy.

  using TpetraIntrepidPoissonExample::exactResidualNorm;
  using TpetraIntrepidPoissonExample::makeMatrixAndRightHandSide;
  using TpetraIntrepidPoissonExample::solveWithBelos;
  using TpetraIntrepidPoissonExample::solveWithBelosGPU;
  using IntrepidPoissonExample::makeMeshInput;
  using IntrepidPoissonExample::setCommandLineArgumentDefaults;
  using IntrepidPoissonExample::setUpCommandLineArguments;
  using IntrepidPoissonExample::parseCommandLineArguments;
  using Tpetra::DefaultPlatform;
  using Teuchos::Comm;
  using Teuchos::outArg;
  using Teuchos::ParameterList;
  using Teuchos::parameterList;
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::rcpFromRef;
  using Teuchos::getFancyOStream;
  using Teuchos::FancyOStream;
  using std::endl;
  // Pull in typedefs from the example's namespace.
  typedef TpetraIntrepidPoissonExample::ST ST;
  typedef TpetraIntrepidPoissonExample::LO LO;
  typedef TpetraIntrepidPoissonExample::GO GO;
  typedef TpetraIntrepidPoissonExample::Node Node;
  typedef Teuchos::ScalarTraits<ST> STS;
  typedef STS::magnitudeType MT;
  typedef Teuchos::ScalarTraits<MT> STM;
  typedef TpetraIntrepidPoissonExample::sparse_matrix_type sparse_matrix_type;
  typedef TpetraIntrepidPoissonExample::vector_type vector_type;
  typedef TpetraIntrepidPoissonExample::operator_type operator_type;

  bool success = true;
  try {

  Teuchos::oblackholestream blackHole;
  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &blackHole);
  const int myRank = mpiSession.getRank ();
  //const int numProcs = mpiSession.getNProc ();

  // Get the default communicator and Kokkos Node instance
  RCP<const Comm<int> > comm =
    DefaultPlatform::getDefaultPlatform ().getComm ();
  RCP<Node> node = DefaultPlatform::getDefaultPlatform ().getNode ();

  // Did the user specify --help at the command line to print help
  // with command-line arguments?
  bool printedHelp = false;
  // Values of command-line arguments.
  int nx, ny, nz;
  std::string xmlInputParamsFile;
  bool verbose, debug;

  // Set default values of command-line arguments.
  setCommandLineArgumentDefaults (nx, ny, nz, xmlInputParamsFile,
                                  verbose, debug);
  // Parse and validate command-line arguments.
  Teuchos::CommandLineProcessor cmdp (false, true);
  setUpCommandLineArguments (cmdp, nx, ny, nz, xmlInputParamsFile,
                             verbose, debug);
  bool gpu = false;
  cmdp.setOption ("gpu", "no-gpu", &gpu,
                  "Run example using GPU node (if supported)");
  int ranks_per_node = 1;
  cmdp.setOption("ranks_per_node", &ranks_per_node,
                 "Number of MPI ranks per node");
  int gpu_ranks_per_node = 1;
  cmdp.setOption("gpu_ranks_per_node", &gpu_ranks_per_node,
                 "Number of MPI ranks per node for GPUs");
  int device_offset = 0;
  cmdp.setOption("device_offset", &device_offset,
                 "Offset for attaching MPI ranks to CUDA devices");
  parseCommandLineArguments (cmdp, printedHelp, argc, argv, nx, ny, nz,
                             xmlInputParamsFile, verbose, debug);
  if (printedHelp) {
    // The user specified --help at the command line to print help
    // with command-line arguments.  We printed help already, so quit
    // with a happy return code.
    return EXIT_SUCCESS;
  }

  // Both streams only print on MPI Rank 0.  "out" only prints if the
  // user specified --verbose.
  RCP<FancyOStream> out =
    getFancyOStream (rcpFromRef ((myRank == 0 && verbose) ? std::cout : blackHole));
  RCP<FancyOStream> err =
    getFancyOStream (rcpFromRef ((myRank == 0 && debug) ? std::cerr : blackHole));

#ifdef HAVE_MPI
  *out << "PARALLEL executable" << endl;
#else
  *out << "SERIAL executable" << endl;
#endif

/**********************************************************************************/
/********************************** GET XML INPUTS ********************************/
/**********************************************************************************/
  ParameterList inputList;
  if (xmlInputParamsFile != "") {
    *out << "Reading parameters from XML file \""
         << xmlInputParamsFile << "\"..." << endl;
    Teuchos::updateParametersFromXmlFile (xmlInputParamsFile,
                                          outArg (inputList));
    if (myRank == 0) {
      inputList.print (*out, 2, true, true);
      *out << endl;
    }
  }

  // Get Pamgen mesh definition string, either from the input
  // ParameterList or from our function that makes a cube and fills in
  // the number of cells along each dimension.
  std::string meshInput = inputList.get("meshInput", "");
  if (meshInput == "") {
    *out << "Generating mesh input string: nx = " << nx
         << ", ny = " << ny
         << ", nz = " << nz << endl;
    meshInput = makeMeshInput (nx, ny, nz);
  }

  // Total application run time
  {
  TEUCHOS_FUNC_TIME_MONITOR_DIFF("Total Time", total_time);

  RCP<sparse_matrix_type> A;
  RCP<vector_type> B, X_exact, X;
  {
    TEUCHOS_FUNC_TIME_MONITOR_DIFF("Total Assembly", total_assembly);
    makeMatrixAndRightHandSide (A, B, X_exact, X, comm, node, meshInput,
                                out, err, verbose, debug);
  }

  const std::vector<MT> norms = exactResidualNorm (A, B, X_exact);
  // X_exact is the exact solution of the PDE, projected onto the
  // discrete mesh.  It may not necessarily equal the exact solution
  // of the linear system.
  *out << "||B - A*X_exact||_2 = " << norms[0] << endl
       << "||B||_2 = " << norms[1] << endl
       << "||A||_F = " << norms[2] << endl;

  // Setup preconditioner
  std::string prec_type = inputList.get("Preconditioner", "None");
  RCP<operator_type> M;
  {
    TEUCHOS_FUNC_TIME_MONITOR_DIFF("Total Preconditioner Setup", total_prec);

    if (prec_type == "MueLu") {
      if (inputList.isSublist("MueLu")) {
        ParameterList mueluParams = inputList.sublist("MueLu");
        M = MueLu::CreateTpetraPreconditioner<ST,LO,GO,Node>(A,mueluParams);
      } else {
        M = MueLu::CreateTpetraPreconditioner<ST,LO,GO,Node>(A);
      }
    }
  }

  bool converged = false;
  int numItersPerformed = 0;
  const MT tol = inputList.get("Convergence Tolerance",
                               STM::squareroot (STM::eps ()));
  const int maxNumIters = inputList.get("Maximum Iterations", 200);
  const int num_steps = inputList.get("Number of Time Steps", 1);
  if (gpu) {
    TEUCHOS_FUNC_TIME_MONITOR_DIFF("Total GPU Solve", total_solve);
    solveWithBelosGPU(converged, numItersPerformed, tol, maxNumIters, num_steps,
                      ranks_per_node, gpu_ranks_per_node, device_offset,
                      prec_type,
                      X, A, B, Teuchos::null, M);
  }
  else {
    TEUCHOS_FUNC_TIME_MONITOR_DIFF("Total Solve", total_solve);
    solveWithBelos (converged, numItersPerformed, tol, maxNumIters, num_steps,
                    X, A, B, Teuchos::null, M);
  }

  // Compute ||X-X_exact||_2
  MT norm_x = X_exact->norm2();
  X_exact->update(-1.0, *X, 1.0);
  MT norm_error = X_exact->norm2();
  *out << endl
       << "||X-X_exact||_2 / ||X_exact||_2 = " << norm_error / norm_x
       << endl;

  } // total time block

  // Summarize timings
  // RCP<ParameterList> reportParams = parameterList ("TimeMonitor::report");
  // reportParams->set ("Report format", std::string ("YAML"));
  // reportParams->set ("writeGlobalStats", true);
  // Teuchos::TimeMonitor::report (*out, reportParams);
  Teuchos::TimeMonitor::summarize(std::cout);

  } //try
  TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success);

  if (success)
    return EXIT_SUCCESS;
  return EXIT_FAILURE;
}