// Build a solver manager of type SolverManagerType and give it a non-null
// parameter list.
//
// params: the caller's parameter list; may be null.
// Returns the newly created solver after setParameters() has been called
// on it.
//
// NOTE(review): the return type, any template header, the braces and the
// opening of the null-check macro call further down are not visible in
// this excerpt -- confirm against the complete file.
makeSolverManagerTmpl (const Teuchos::RCP<Teuchos::ParameterList>& params)
  using Teuchos::ParameterList;
  using Teuchos::parameterList;
  using Teuchos::RCP;

  RCP<SolverManagerType> solver = rcp (new SolverManagerType);

  // Some solvers may not like to get a null ParameterList.  If params
  // is null, replace it with an empty parameter list.  The solver
  // will fill in default parameters for that case.  Use the name of
  // the solver's default parameters to name the new empty list.
  RCP<ParameterList> pl;
  if (params.is_null()) {
    pl = parameterList (solver->getValidParameters ()->name ());
  } else {
    pl = params;
    // Arguments of a sanity check on pl: the condition (pl is null), the
    // exception type (std::logic_error) and the message text.
    pl.is_null(), std::logic_error,
    "Belos::SolverFactory: ParameterList to pass to solver is null.  This "
    "should never happen.  Please report this bug to the Belos developers.");
  // Hand the (now non-null) list to the solver before returning it.
  solver->setParameters (pl);
  return solver;
// Fill paramList_ from the configured XML sources.
//
// If paramList_ is unset, a new list named "DefaultLinearSolverBuilder" is
// created first.  Then, in this order:
//   1. if paramsXmlFileName() is non-empty, paramList_ is updated from
//      that XML file;
//   2. if extraParamsXmlString() is non-empty, paramList_ is updated from
//      that XML string.
//
// out: optional stream; when non-null, a progress message is written to it
//      before each of the two updates.
//
// NOTE(review): the braces of this function are not visible in this excerpt.
void DefaultLinearSolverBuilder::readParameters( std::ostream *out )
  using Teuchos::parameterList;
  using Teuchos::ptr;
  using Teuchos::updateParametersFromXmlFile;
  using Teuchos::updateParametersFromXmlString;
  using std::endl;
  // Make sure there is a list to update.
  if (!paramList_.get()) {
    paramList_ = parameterList("DefaultLinearSolverBuilder");
  // Source 1: the XML file, if a file name was given.
  if (paramsXmlFileName().length()) {
    if (out) {
      *out << endl << "Reading parameters from XML file \"" 
           << paramsXmlFileName() << "\" ..." << endl;
    updateParametersFromXmlFile (paramsXmlFileName (), paramList_.ptr());
  // Source 2: the extra XML string, applied after the file.
  if (extraParamsXmlString().length()) {
    if (out) {
      *out << endl << "Appending extra parameters from the XML string \""
           << extraParamsXmlString() << "\" ..." << endl;
    updateParametersFromXmlString (extraParamsXmlString (), paramList_.ptr());
      /// \brief Return a valid parameter list for verifying Tsqr.
      /// Call this once to get a valid parameter list with all the
      /// defaults filled in.  This list is valid for all the Scalar
      /// types which TsqrVerifierCaller::run tests.
      ///
      /// The returned list is named "FullTsqrVerifier".  Every entry is
      /// set to the default declared below, together with a
      /// documentation string.
      ///
      /// NOTE(review): the braces of this method are not visible in this
      /// excerpt.
      Teuchos::RCP<const Teuchos::ParameterList>
      getValidParameterList () const
        using Teuchos::ParameterList;
        using Teuchos::parameterList;
        using Teuchos::RCP;

        RCP<ParameterList> plist = parameterList ("FullTsqrVerifier");

        // Default values for every parameter set below.
        const size_t cacheSizeHint = 0;
        const int numCores = 1;
        const ordinal_type numRowsLocal = 100;
        const ordinal_type numCols = 10;
        const bool contiguousCacheBlocks = false;
        const bool testFactorExplicit = true;
        const bool testRankRevealing = true;
        const bool printFieldNames = true;
        const bool printResults = true;
        const bool failIfInaccurate = true;
        const bool debug = false;

        // Parameters for configuring Tsqr itself.
        plist->set ("cacheSizeHint", cacheSizeHint,
                    "Cache size hint in bytes.  "
                    "Zero means TSQR picks a reasonable default.");
        plist->set ("numCores", numCores,
                    "Number of partition(s) to use for TbbTsqr (if "
                    "applicable).  Must be a positive integer.");

        // Parameters for testing Tsqr.
        plist->set ("numRowsLocal", numRowsLocal,
                    "Number of rows per (MPI) process in the test matrix.  "
                    "Must be >= the number of columns.");
        plist->set ("numCols", numCols,
                    "Number of columns in the test matrix.");
        plist->set ("contiguousCacheBlocks", contiguousCacheBlocks,
                    "Whether to test the factorization with contiguously "
                    "stored cache blocks.");
        plist->set ("testFactorExplicit", testFactorExplicit,
                    "Whether to test TSQR's factorExplicit() (a hopefully "
                    "faster path than calling factor() and explicit_Q() in "
        // NOTE(review): the documentation string above is cut off in this
        // excerpt; its final piece and the closing of the call are missing.
        plist->set ("testRankRevealing", testRankRevealing,
                    "Whether to test TSQR's rank-revealing capability.");
        plist->set ("printFieldNames", printFieldNames,
                    "Whether to print field names (this is only done once, "
                    "for all Scalar types tested).");
        plist->set ("printResults", printResults,
                    "Whether to print test results.");
        plist->set ("failIfInaccurate", failIfInaccurate,
                    "Whether to fail the test if the factorization "
                    "is not sufficiently accurate.");
        plist->set ("debug", debug,
                    "Whether to print debugging output.");
        return plist;
    // Configure this object and its two components from a parameter list.
    //
    // plist: the caller's list; may be null, in which case a copy of the
    //        list returned by getValidParameters() is used instead.
    //
    // The "NodeTsqr" sublist is passed to nodeTsqr_ and the "DistTsqr"
    // sublist to distTsqr_; the list actually used is then stored via
    // setMyParamList().
    //
    // NOTE(review): the return type and the braces are not visible in this
    // excerpt.
    setParameterList (const Teuchos::RCP<Teuchos::ParameterList>& plist)
        using Teuchos::ParameterList;
        using Teuchos::parameterList;
        using Teuchos::RCP;
        using Teuchos::sublist;

        // Fall back to a fresh copy of the valid parameters when no list
        // was supplied.
        RCP<ParameterList> params = plist.is_null() ?
                                    parameterList (*getValidParameters ()) : plist;
        nodeTsqr_->setParameterList (sublist (params, "NodeTsqr"));
        distTsqr_->setParameterList (sublist (params, "DistTsqr"));

        this->setMyParamList (params);
    // Return the list of valid parameters for this object.
    //
    // The list is built only when defaultParams_ is still null: it is
    // named "TSQR implementation" and holds two entries, "NodeTsqr" and
    // "DistTsqr", each set to a copy of the corresponding component's
    // valid parameters.  The result is stored in defaultParams_ and that
    // stored list is what gets returned.
    //
    // NOTE(review): the braces of this method are not visible in this
    // excerpt.
    Teuchos::RCP<const Teuchos::ParameterList>
    getValidParameters () const
        using Teuchos::RCP;
        using Teuchos::rcp;
        using Teuchos::ParameterList;
        using Teuchos::parameterList;

        if (defaultParams_.is_null()) {
            RCP<ParameterList> params = parameterList ("TSQR implementation");
            params->set ("NodeTsqr", *(nodeTsqr_->getValidParameters ()));
            params->set ("DistTsqr", *(distTsqr_->getValidParameters ()));
            defaultParams_ = params;
        return defaultParams_;
  // Build and return the list of parameters accepted by MINRES, each with
  // its default value and a documentation string.  The list is named
  // "MINRES".  "Convergence Tolerance", "Maximum Iterations" and
  // "Block Size" additionally carry an EnhancedNumberValidator with the
  // bounds given below.
  //
  // NOTE(review): the braces of this function are not visible in this
  // excerpt.
  Teuchos::RCP<const Teuchos::ParameterList>
  MinresSolMgr<ScalarType, MV, OP>::defaultParameters()
    using Teuchos::ParameterList;
    using Teuchos::parameterList;
    using Teuchos::RCP;
    using Teuchos::rcp;
    using Teuchos::rcpFromRef;
    using Teuchos::EnhancedNumberValidator;
    typedef MagnitudeType MT;
    typedef Teuchos::ScalarTraits<MT> MST;

    // List of parameters accepted by MINRES, and their default values.
    RCP<ParameterList> pl = parameterList ("MINRES");

    // Default tolerance is sqrt(eps) of the magnitude type; validated
    // against the range [0, rmax].
    pl->set ("Convergence Tolerance", MST::squareroot (MST::eps()),
	     "Relative residual tolerance that needs to be achieved by "
	     "the iterative solver, in order for the linear system to be "
	     "declared converged.",
	     rcp (new EnhancedNumberValidator<MT> (MST::zero(), MST::rmax())));
    pl->set ("Maximum Iterations", static_cast<int>(1000),
	     "Maximum number of iterations allowed for each right-hand "
	     "side solved.",
	     rcp (new EnhancedNumberValidator<int> (0, INT_MAX)));
    // The validator range (1, 1) admits only a block size of 1.
    pl->set ("Block Size", static_cast<int>(1),
	     "Number of vectors in each block.  WARNING: The current "
	     "implementation of MINRES only accepts a block size of 1, "
	     "since it can only solve for 1 right-hand side at a time.",
	     rcp (new EnhancedNumberValidator<int> (1, 1)));
    // The output-related parameters below are stored without validators.
    pl->set ("Verbosity", (int) Belos::Errors,
	     "The type(s) of solver information that should "
	     "be written to the output stream.");
    pl->set ("Output Style", (int) Belos::General,
	     "What style is used for the solver information written "
	     "to the output stream.");
    pl->set ("Output Frequency", static_cast<int>(-1),
	     "How often (in terms of number of iterations) intermediate "
	     "convergence information should be written to the output stream."
	     "  -1 means never.");
    pl->set ("Output Stream", rcpFromRef(std::cout),
	     "A reference-counted pointer to the output stream where all "
	     "solver output is sent.  The output stream defaults to stdout.");
    pl->set ("Timer Label", std::string("Belos"),
	     "The string to use as a prefix for the timer labels.");
    return pl;
 // Generate the structure of an N x N symmetric tridiagonal test problem
 // (2 on the diagonal, -1 on the two off-diagonals) and return a newly
 // created sparse-ops object built on `node`.
 //
 // node:     node object handed to the graph and sparse-ops constructors.
 // N:        number of rows and columns.
 // totalNNZ: output; set to 3*N - 2, the number of stored entries.
 //
 // NOTE(review): braces and possibly further statements are missing from
 // this excerpt.  In the visible code `vals` is filled but never passed
 // on, and the matrix A is created but not used afterwards -- confirm
 // against the complete file.
 // NOTE(review): for N < 2 the visible indexing looks like it would go
 // out of range (e.g. N == 1 gives totalNNZ == 1 while index 1 is
 // written, and the loop condition i != N-1 is never met starting from
 // i = 1) -- confirm that callers guarantee N >= 2.
 RCP<typename Kokkos::DefaultKernels<float,int,Node>::SparseOps::template bind_scalar<float>::other_type>
 gen_prob(RCP<Node> node, int N, size_t &totalNNZ)
   typedef typename Kokkos::DefaultKernels<float,int,Node>::SparseOps   DSM;
   typedef typename DSM::template bind_scalar<float>::other_type       fDSM;
   typedef typename fDSM::template graph<int,Node>::graph_type                  GRPH;
   typedef typename fDSM::template matrix<float,int,Node>::matrix_type           MAT;
   // generate symmetric tridiagonal matrix
   RCP<GRPH> G = rcp(new GRPH(N,N,node,null));
   RCP<MAT>  A= rcp(new MAT(G,null));
   // allocate buffers for offsets, indices and values
   totalNNZ = 3*N - 2;
   ArrayRCP<size_t> offsets(N+1);
   ArrayRCP<int>    inds(totalNNZ);
   ArrayRCP<float>  vals(totalNNZ);
     // offsets[i] holds the position of row i's first entry in inds/vals;
     // offsets[N] ends up equal to the total number of entries written.
     size_t NNZsofar = 0;
     // First row: diagonal and right neighbour only.
     offsets[0] = NNZsofar;
     inds[NNZsofar] = 0; inds[NNZsofar+1] =  1;
     vals[NNZsofar] = 2; vals[NNZsofar+1] = -1;
     NNZsofar += 2;
     // Interior rows: left neighbour, diagonal, right neighbour.
     for (int i=1; i != N-1; ++i) {
       offsets[i] = NNZsofar;
       inds[NNZsofar] = i-1; inds[NNZsofar+1] = i; inds[NNZsofar+2] = i+1;
       vals[NNZsofar] =  -1; vals[NNZsofar+1] = 2; vals[NNZsofar+2] =  -1;
       NNZsofar += 3;
     // Last row: left neighbour and diagonal only.
     offsets[N-1] = NNZsofar;
     inds[NNZsofar] = N-2; inds[NNZsofar+1] = N-1;
     vals[NNZsofar] =  -1; vals[NNZsofar+1] = 2;
     NNZsofar += 2;
     offsets[N]   = NNZsofar;
   G->setStructure(offsets, inds);
   // Drop the local handles to the buffers.
   offsets = Teuchos::null;
   inds    = Teuchos::null;
   vals    = Teuchos::null;
   RCP<fDSM> dsm = rcp(new fDSM(node));
   return dsm;
// File: MultiPrecDriver.hpp  Project: 00liujj/trilinos
  // Run the solver test for one node type and report the relative residual.
  //
  // Visible steps: create fill parameters with "Preserve Local Graph" set,
  // take the row map from the matrix A, initialise the solver-stack
  // database `db`, copy the right-hand side b into the database vector
  // stored under "bx", then compute |b - A*x|/|b| and set testPassed to
  // false if that ratio exceeds the database entry "tolerance" (five times
  // that tolerance when MPStack::bottom is true).  When unfusedTest is set
  // the same check is repeated, and finally the timer summary is written
  // to *out.
  //
  // NOTE(review): this excerpt is missing the braces, the code that reads
  // the matrix into A, the solve calls, and the opening of the reduction
  // wrapper around the two lambda/pair-op argument groups (note the
  // unbalanced closing parenthesis after make_pair_op).  `out`, `params`,
  // `testPassed` and `unfusedTest` are not declared in the visible code --
  // presumably members of the enclosing class; confirm.
  void run(Teuchos::ParameterList &myMachPL, const Teuchos::RCP<const Teuchos::Comm<int> > &comm, const Teuchos::RCP<Node> &node) 
    using std::pair;
    using std::make_pair;
    using std::plus;
    using std::endl;
    using Teuchos::null;
    using Teuchos::RCP;
    using Teuchos::ParameterList;
    using Teuchos::parameterList;
    using TpetraExamples::make_pair_op;
    using Tpetra::RTI::reductionGlob;
    using Tpetra::RTI::ZeroOp;
    using Tpetra::RTI::binary_pre_transform_reduce;
    using Tpetra::RTI::binary_transform;

    // Static types
    typedef typename MPStack::type   S;
    typedef int                     LO;
    typedef int                     GO;
    typedef Tpetra::Map<LO,GO,Node>               Map;
    typedef Tpetra::CrsMatrix<S,LO,GO,Node> CrsMatrix;
    typedef Tpetra::Vector<S,LO,GO,Node>       Vector;

    *out << "Running test with Node==" << Teuchos::typeName(*node) << " on rank " << comm->getRank() << "/" << comm->getSize() << std::endl;

    // read the matrix
    // NOTE(review): no statement that assigns A is visible before A is
    // dereferenced below.
    RCP<CrsMatrix> A;
    RCP<const Map> rowMap = null;
    RCP<ParameterList> fillParams = parameterList();
    fillParams->set("Preserve Local Graph",true);
    // must preserve the local graph in order to do convert() calls later
    rowMap = A->getRowMap();

    // init the solver stack
    TpetraExamples::RFPCGInit<S,LO,GO,Node> init(A);
    RCP<ParameterList> db = Tpetra::Ext::initStackDB<MPStack>(*params,init);

    testPassed = true;

    // choose a solution, compute a right-hand-side
    auto x = Tpetra::createVector<S>(rowMap),
         b = Tpetra::createVector<S>(rowMap);
      // init the rhs
      auto bx = db->get<RCP<Vector>>("bx");
      binary_transform( *bx, *b, [](S, S bi) {return bi;}); // bx = b

    // call the solve

    // check that residual is as requested
      // xhat is read from the same "bx" database entry as above.
      auto xhat = db->get<RCP<Vector>>("bx"),
           bhat = Tpetra::createVector<S>(rowMap);
      // compute bhat-b, while simultaneously computing |bhat-b|^2 and |b|^2
      auto nrms = binary_pre_transform_reduce(*bhat, *b, 
                                                [](S bhati, S bi){ return bi-bhati;}, // bhati = bi-bhat
                                                [](S bhati, S bi){ return make_pair(bhati*bhati, bi*bi); },
                                                make_pair_op<S,S>(plus<S>())) );
      // Square roots of the two accumulated sums: error norm and rhs norm.
      const S enrm = Teuchos::ScalarTraits<S>::squareroot(nrms.first),
              bnrm = Teuchos::ScalarTraits<S>::squareroot(nrms.second);
      // check that residual is as requested
      *out << "|b - A*x|/|b|: " << enrm / bnrm << endl;
      const double tolerance = db->get<double>("tolerance");
      if (MPStack::bottom) {
        // give a little slack
        if (enrm / bnrm > 5*tolerance) testPassed = false;
      else {
        if (enrm / bnrm > tolerance) testPassed = false;

    // solve again, with the unfused version, just for timings purposes
    if (unfusedTest) 
      // init the rhs
      auto bx = db->get<RCP<Vector>>("bx");
      binary_transform( *bx, *b, [](S, S bi) {return bi;}); // bx = b
      // call the solve
      // test the result
      auto xhat = db->get<RCP<Vector>>("bx"),
           bhat = Tpetra::createVector<S>(rowMap);
      // compute bhat-b, while simultaneously computing |bhat-b|^2 and |b|^2
      auto nrms = binary_pre_transform_reduce(*bhat, *b, 
                                                [](S bhati, S bi){ return bi-bhati;}, // bhati = bi-bhat
                                                [](S bhati, S bi){ return make_pair(bhati*bhati, bi*bi); },
                                                make_pair_op<S,S>(plus<S>())) );
      const S enrm = Teuchos::ScalarTraits<S>::squareroot(nrms.first),
              bnrm = Teuchos::ScalarTraits<S>::squareroot(nrms.second);
      // check that residual is as requested
      *out << "|b - A*x|/|b|: " << enrm / bnrm << endl;
      const double tolerance = db->get<double>("tolerance");
      if (MPStack::bottom) {
        // give a little slack
        if (enrm / bnrm > 5*tolerance) testPassed = false;
      else {
        if (enrm / bnrm > tolerance) testPassed = false;
    // print timings
    Teuchos::TimeMonitor::summarize( *out );
/// \fn main
/// \brief Benchmark driver for (Mat)OrthoManager subclasses
///
/// Parses the command line, optionally loads a sparse matrix to use as
/// the inner product operator, creates the requested OrthoManager through
/// OrthoManagerFactory and runs OrthoManagerBenchmarker::benchmark on it.
/// Returns EXIT_SUCCESS when `success` is true, and also (early) when the
/// benchmark was not requested or help was printed; otherwise
/// EXIT_FAILURE.
// NOTE(review): this excerpt is missing the return type, the braces and,
// in two places, the exception-type argument of
// TEUCHOS_TEST_FOR_EXCEPTION; `endl` is used without a visible using
// declaration.  Confirm against the complete file.
main (int argc, char *argv[])
  using Belos::OrthoManager;
  using Belos::OrthoManagerFactory;
  using Belos::OutputManager;
  using Teuchos::CommandLineProcessor;
  using Teuchos::ParameterList;
  using Teuchos::parameterList;
  using Teuchos::RCP;
  using Teuchos::rcp;

  Tpetra::ScopeGuard tpetraScope (&argc, &argv);

  bool success = false;
  bool verbose = false; // Verbosity of output
  try {
    RCP<const Teuchos::Comm<int> > pComm = Tpetra::getDefaultComm();
    // This factory object knows how to make a (Mat)OrthoManager
    // subclass, given a name for the subclass.  The name is not the
    // same as the class' syntactic name: e.g., "TSQR" is the name of
    // TsqrOrthoManager.
    OrthoManagerFactory<scalar_type, MV, OP> factory;

    // The name of the (Mat)OrthoManager subclass to instantiate.
    std::string orthoManName (factory.defaultName());

    // For SimpleOrthoManager: the normalization method to use.  Valid
    // values: "MGS", "CGS".
    std::string normalization ("CGS");

    // Name of the Harwell-Boeing sparse matrix file from which to read
    // the inner product operator matrix.  If name is "" or not provided
    // at the command line, use the standard Euclidean inner product.
    std::string filename;

    bool debug = false;   // Whether to print debugging-level output
    // Whether or not to run the benchmark.  If false, we let this
    // "test" pass trivially.
    bool benchmark = false;

    // Whether to display benchmark results compactly (in a CSV format),
    // or in a human-readable table.
    bool displayResultsCompactly = false;

    // Default _local_ (per MPI process) number of rows.  This will
    // change if a sparse matrix is loaded in as an inner product
    // operator.  Regardless, the number of rows per MPI process must be
    // no less than numCols*numBlocks in order for TSQR to work.  To
    // ensure that the test always passes with default parameters, we
    // scale by the number of processes.  The default value below may be
    // changed by a command-line parameter with a corresponding name.
    int numRowsPerProcess = 100;

    // The OrthoManager is benchmarked with numBlocks multivectors of
    // width numCols each, for numTrials trials.  The values below are
    // defaults and may be changed by the corresponding command-line
    // arguments.
    int numCols = 10;
    int numBlocks = 5;
    int numTrials = 3;

    // Register every command-line option with its target variable and
    // help text.
    CommandLineProcessor cmdp (false, true);
    cmdp.setOption ("benchmark", "nobenchmark", &benchmark,
        "Whether to run the benchmark.  If not, this \"test\" "
        "passes trivially.");
    cmdp.setOption ("verbose", "quiet", &verbose,
        "Print messages and results.");
    cmdp.setOption ("debug", "nodebug", &debug,
        "Print debugging information.");
    cmdp.setOption ("compact", "human", &displayResultsCompactly,
        "Whether to display benchmark results compactly (in a "
        "CSV format), or in a human-readable table.");
    cmdp.setOption ("filename", &filename,
        "Filename of a Harwell-Boeing sparse matrix, used as the "
        "inner product operator by the orthogonalization manager."
        "  If not provided, no matrix is read and the Euclidean "
        "inner product is used.");
      // Build the help text for --ortho at run time so that it can list
      // the names the factory reports as valid.
      std::ostringstream os;
      const int numValid = factory.numOrthoManagers();
      const bool plural = numValid > 1 || numValid == 0;

      os << "OrthoManager subclass to benchmark.  There ";
      os << (plural ? "are " : "is ") << numValid << (plural ? "s: " : ": ");
      factory.printValidNames (os);
      os << ".  If none is provided, the test trivially passes.";
      cmdp.setOption ("ortho", &orthoManName, os.str().c_str());
    cmdp.setOption ("normalization", &normalization,
        "For SimpleOrthoManager (--ortho=Simple): the normalization "
        "method to use.  Valid values: \"MGS\", \"CGS\".");
    cmdp.setOption ("numRowsPerProcess", &numRowsPerProcess,
        "Number of rows per MPI process in the test multivectors.  "
        "If an input matrix is given, this value is ignored, since "
        "the vectors must be commensurate with the dimensions of "
        "the matrix.");
    cmdp.setOption ("numCols", &numCols,
        "Number of columns in the input multivector (>= 1).");
    cmdp.setOption ("numBlocks", &numBlocks,
        "Number of block(s) to benchmark (>= 1).");
    cmdp.setOption ("numTrials", &numTrials,
        "Number of trial(s) per timing run (>= 1).");

    // Parse the command-line arguments.
      const CommandLineProcessor::EParseCommandLineReturn parseResult = cmdp.parse (argc,argv);
      // If the caller asks us to print the documentation, or does not
      // explicitly say to run the benchmark, we let this "test" pass
      // trivially.
      if (! benchmark || parseResult == CommandLineProcessor::PARSE_HELP_PRINTED)
        if (Teuchos::rank(*pComm) == 0)
          std::cout << "End Result: TEST PASSED" << endl;
        return EXIT_SUCCESS;
      // NOTE(review): the exception-type argument of this check is not
      // visible in this excerpt.
      TEUCHOS_TEST_FOR_EXCEPTION(parseResult != CommandLineProcessor::PARSE_SUCCESSFUL,
          "Failed to parse command-line arguments");

    // Total number of rows in the test vector(s).
    // This may be changed if we load in a sparse matrix.
    int numRows = numRowsPerProcess * pComm->getSize();
    // Validate command-line arguments
    TEUCHOS_TEST_FOR_EXCEPTION(numRowsPerProcess <= 0, std::invalid_argument,
        "numRowsPerProcess <= 0 is not allowed");
    TEUCHOS_TEST_FOR_EXCEPTION(numCols <= 0, std::invalid_argument,
        "numCols <= 0 is not allowed");
    TEUCHOS_TEST_FOR_EXCEPTION(numBlocks <= 0, std::invalid_argument,
        "numBlocks <= 0 is not allowed");

    // Declare an output manager for handling local output.  Initialize,
    // using the caller's desired verbosity level.
    RCP<OutputManager<scalar_type> > outMan =
      Belos::Test::makeOutputManager<scalar_type> (verbose, debug);

    // Stream for debug output.  If debug output is not enabled, then
    // this stream doesn't print anything sent to it (it's a "black
    // hole" stream).
    std::ostream& debugOut = outMan->stream(Belos::Debug);
    Belos::Test::printVersionInfo (debugOut);

    // Load the inner product operator matrix from the given filename.
    // If filename == "", use the identity matrix as the inner product
    // operator (the Euclidean inner product), and leave M as
    // Teuchos::null.  Also return an appropriate Map (which will
    // always be initialized; it should never be Teuchos::null).
    RCP<map_type> map;
    RCP<sparse_matrix_type> M;
      using Belos::Test::loadSparseMatrix;
      // If the sparse matrix is loaded successfully, this call will
      // modify numRows to be the total number of rows in the sparse
      // matrix.  Otherwise, it will leave numRows alone.
      std::pair<RCP<map_type>, RCP<sparse_matrix_type> > results =
        loadSparseMatrix<local_ordinal_type, global_ordinal_type, node_type> (pComm, filename, numRows, debugOut);
      map = results.first;
      M = results.second;
    TEUCHOS_TEST_FOR_EXCEPTION(map.is_null(), std::logic_error,
        "Error: (Mat)OrthoManager test code failed to "
        "initialize the Map");
    if (M.is_null())
      // Without a matrix, the number of rows per process must exceed the
      // number of columns: the check below rejects
      // numRowsPerProcess <= numCols.
      // NOTE(review): the exception-type argument of this check is not
      // visible in this excerpt.
      TEUCHOS_TEST_FOR_EXCEPTION(numRowsPerProcess <= numCols,
          "numRowsPerProcess <= numCols is not allowed");
    // Loading the sparse matrix may have changed numRows, so check
    // again that the number of rows per process is >= numCols.
    // getNodeNumElements() returns a size_t, which is unsigned, and you
    // shouldn't compare signed and unsigned values.
    if (map->getNodeNumElements() < static_cast<size_t>(numCols))
      std::ostringstream os;
      os << "The number of elements on this process " << pComm->getRank()
        << " is too small for the number of columns that you want to test."
        << "  There are " << map->getNodeNumElements() << " elements on "
        "this process, but the normalize() method of the MatOrthoManager "
        "subclass will need to process a multivector with " << numCols
        << " columns.  Not all MatOrthoManager subclasses can handle a "
        "local row block with fewer rows than columns.";
      // QUESTION (mfh 26 Jan 2011) Should this be a logic error
      // instead?  It's really TSQR's fault that it can't handle a
      // local number of elements less than the number of columns.
      throw std::invalid_argument(os.str());

    // Using the factory object, instantiate the specified OrthoManager
    // subclass to be tested.  Specify "fast" parameters for a fair
    // benchmark comparison, but override the fast parameters to get the
    // desired normalization method for SimpleOrthoManager.
    RCP<OrthoManager<scalar_type, MV> > orthoMan;
      std::string label (orthoManName);
      // Start from a copy of the factory's "fast" parameters for this
      // OrthoManager.
      RCP<ParameterList> params =
        parameterList (*(factory.getFastParameters (orthoManName)));
      if (orthoManName == "Simple") {
        params->set ("Normalization", normalization);
        label = label + " (" + normalization + " normalization)";
      orthoMan = factory.makeOrthoManager (orthoManName, M, outMan, label, params);

    // "Prototype" multivector.  The test code will use this (via
    // Belos::MultiVecTraits) to clone other multivectors as necessary.
    // (This means the test code doesn't need the Map, and it also makes
    // the test code independent of the idea of a Map.)  We only have to
    // allocate one column, because the entries are S are not even read.
    // (We could allocate zero columns, if the MV object allows it.  We
    // play it safe and allocate 1 column instead.)
    RCP<MV> X = rcp (new MV (map, 1));

    // "Compact" mode means that we have to override
    // TimeMonitor::summarize(), which both handles multiple MPI
    // processes correctly (only Rank 0 prints to std::cout), and prints
    // verbosely in a table form.  We deal with the former by making an
    // ostream which is std::cout on Rank 0, and prints nothing (is a
    // "bit bucket") elsewhere.  We deal with the latter inside the
    // benchmark itself.
    Teuchos::oblackholestream bitBucket;
    // The bit bucket is chosen only in compact mode on ranks other than 0;
    // in every other case results go to std::cout.
    std::ostream& resultStream =
      (displayResultsCompactly && Teuchos::rank(*pComm) != 0) ? bitBucket : std::cout;

    // Benchmark the OrthoManager subclass.
    typedef Belos::Test::OrthoManagerBenchmarker<scalar_type, MV> benchmarker_type;
    benchmarker_type::benchmark (orthoMan, orthoManName, normalization, X,
        numCols, numBlocks, numTrials,
        outMan, resultStream, displayResultsCompactly);

    success = true;

    // Only Rank 0 gets to write to cout.
    if (Teuchos::rank(*pComm) == 0)
      std::cout << "End Result: TEST PASSED" << endl;
  TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);

  return ( success ? EXIT_SUCCESS : EXIT_FAILURE );
  // Solve the linear system for right-hand side(s) B into X using the
  // Belos iterative solver held in iterativeSolver_, and return a
  // SolveStatus describing the outcome.
  //
  // M_trans:       transpose mode; checked via assertSolveSupports().
  // B:             right-hand side multivector.
  // X:             solution multivector (also handed to the linear problem).
  // solveCriteria: optional solve criteria; when null, the convergence
  //                tolerance defaults to defaultTol_.
  //
  // Visible steps: set the problem on lp_, assemble a per-solve parameter
  // list (tolerance, optional "Maximum Iterations", "Output Stream"), call
  // iterativeSolver_->solve(), translate the Belos return code into
  // solveStatus, compose a summary message and store extra result
  // parameters.
  //
  // NOTE(review): the line carrying the function name, the braces and many
  // statement openings/arguments are missing from this excerpt (macro
  // names, the right-hand side of some assignments, the value arguments of
  // the final set() calls).  Confirm every detail against the complete
  // file.
  const EOpTransp M_trans,
  const MultiVectorBase<Scalar> &B,
  const Ptr<MultiVectorBase<Scalar> > &X,
  const Ptr<const SolveCriteria<Scalar> > solveCriteria
  ) const

  THYRA_FUNC_TIME_MONITOR("Stratimikos: BelosLOWS");

  using Teuchos::rcp;
  using Teuchos::rcpFromRef;
  using Teuchos::rcpFromPtr;
  using Teuchos::FancyOStream;
  using Teuchos::OSTab;
  using Teuchos::ParameterList;
  using Teuchos::parameterList;
  using Teuchos::describe;
  typedef Teuchos::ScalarTraits<Scalar> ST;
  typedef typename ST::magnitudeType ScalarMag;
  Teuchos::Time totalTimer(""), timer("");

  assertSolveSupports(*this, M_trans, solveCriteria);
  // 2010/08/22: rabartl: Bug 4915 ToDo: Move the above into the NIV function
  // solve(...).

  const RCP<FancyOStream> out = this->getOStream();
  const Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel();
  OSTab tab = this->getOSTab();
  // Describe the operator, the solver and the problem size only when a
  // stream exists and the verbosity is above VERB_LOW.
  if (out.get() && static_cast<int>(verbLevel) > static_cast<int>(Teuchos::VERB_LOW)) {
    *out << "\nStarting iterations with Belos:\n";
    OSTab tab2(out);
    *out << "Using forward operator = " << describe(*fwdOpSrc_->getOp(),verbLevel);
    *out << "Using iterative solver = " << describe(*iterativeSolver_,verbLevel);
    *out << "With #Eqns="<<B.range()->dim()<<", #RHSs="<<B.domain()->dim()<<" ...\n";

  // Set RHS and LHS

  bool ret = lp_->setProblem( rcpFromPtr(X), rcpFromRef(B) );
    // Arguments of a failure check on `ret`: condition, exception type
    // (CatastrophicSolveFailure) and message.
    ret == false, CatastrophicSolveFailure
    ,"Error, the Belos::LinearProblem could not be set for the current solve!"

  // Set the solution criteria

  // Parameter list for the current solve.
  const RCP<ParameterList> tmpPL = Teuchos::parameterList();

  // The solver's valid parameter list.
  RCP<const ParameterList> validPL = iterativeSolver_->getValidParameters();

  SolveMeasureType solveMeasureType;
  RCP<GeneralSolveCriteriaBelosStatusTest<Scalar> > generalSolveCriteriaBelosStatusTest;
  if (nonnull(solveCriteria)) {
    solveMeasureType = solveCriteria->solveMeasureType;
    const ScalarMag requestedTol = solveCriteria->requestedTol;
    // NOTE(review): the conditions that separate the following cases are
    // not all visible in this excerpt.  In each visible case the tolerance
    // is requestedTol when one was specified and defaultTol_ otherwise.
    if (solveMeasureType.useDefault()) {
      tmpPL->set("Convergence Tolerance", defaultTol_);
      if (requestedTol != SolveCriteria<Scalar>::unspecifiedTolerance()) {
        tmpPL->set("Convergence Tolerance", requestedTol);
      else {
        tmpPL->set("Convergence Tolerance", defaultTol_);
      setResidualScalingType (tmpPL, validPL, "Norm of RHS");
      if (requestedTol != SolveCriteria<Scalar>::unspecifiedTolerance()) {
        tmpPL->set("Convergence Tolerance", requestedTol);
      else {
        tmpPL->set("Convergence Tolerance", defaultTol_);
      setResidualScalingType (tmpPL, validPL, "Norm of Initial Residual");
    else {
      // Set the most generic (and inefficient) solve criteria
      generalSolveCriteriaBelosStatusTest = createGeneralSolveCriteriaBelosStatusTest(
        *solveCriteria, convergenceTestFrequency_);
      // Set the verbosity level (one level down)
      generalSolveCriteriaBelosStatusTest->setVerbLevel(incrVerbLevel(verbLevel, -1));
      // Set the default convergence tolerance to always converged to allow
      // the above status test to control things.
      tmpPL->set("Convergence Tolerance", 1.0);
    // maximum iterations
    // Forwarded only when the caller's extra parameters hold an int entry
    // named "Maximum Iterations".
    if (nonnull(solveCriteria->extraParameters)) {
      if (Teuchos::isParameterType<int>(*solveCriteria->extraParameters,"Maximum Iterations")) {
        tmpPL->set("Maximum Iterations", Teuchos::get<int>(*solveCriteria->extraParameters,"Maximum Iterations"));
  else {
    // No solveCriteria was even passed in!
    tmpPL->set("Convergence Tolerance", defaultTol_);

  // Solve the linear system

  Belos::ReturnType belosSolveStatus;
      // Stream for solver output: `out` when the verbosity is above
      // VERB_LOW, otherwise a FancyOStream wrapping a black-hole stream.
      outUsed =
      ( static_cast<int>(verbLevel) > static_cast<int>(Teuchos::VERB_LOW)
        ? out
        : rcp(new FancyOStream(rcp(new Teuchos::oblackholestream())))
    Teuchos::OSTab tab1(outUsed,1,"BELOS");
    tmpPL->set("Output Stream", outUsed);
    // NOTE(review): the body of the following condition is not visible in
    // this excerpt.
    if (nonnull(generalSolveCriteriaBelosStatusTest)) {
    belosSolveStatus = iterativeSolver_->solve();

  // Report the solve status


  SolveStatus<Scalar> solveStatus;

  switch (belosSolveStatus) {
    case Belos::Unconverged: {
      solveStatus.solveStatus = SOLVE_STATUS_UNCONVERGED;
      // Set achievedTol even if the solver did not converge.  This is
      // helpful for things like nonlinear solvers, which might be
      // able to use a partially converged result, and which would
      // like to know the achieved convergence tolerance for use in
      // computing bounds.  It's also helpful for estimating whether a
      // small increase in the maximum iteration count might be
      // helpful next time.
      try {
	// Some solvers might not have implemented achievedTol(). 
	// The default implementation throws std::runtime_error.
	solveStatus.achievedTol = iterativeSolver_->achievedTol();
      } catch (std::runtime_error&) {
	// Do nothing; use the default value of achievedTol.
    case Belos::Converged: {
      solveStatus.solveStatus = SOLVE_STATUS_CONVERGED;
      if (nonnull(generalSolveCriteriaBelosStatusTest)) {
	// The user set a custom status test.  This means that we
	// should ask the custom status test itself, rather than the
	// Belos solver, what the final achieved convergence tolerance
	// was.
        // NOTE(review): the right-hand side of this declaration is not
        // visible in this excerpt.
        const ArrayView<const ScalarMag> achievedTol = 
        // Report the largest of the per-entry achieved tolerances.
        solveStatus.achievedTol = ST::zero();
        for (Ordinal i = 0; i < achievedTol.size(); ++i) {
          solveStatus.achievedTol = std::max(solveStatus.achievedTol, achievedTol[i]);
      else {
	try {
	  // Some solvers might not have implemented achievedTol(). 
	  // The default implementation throws std::runtime_error.
	  solveStatus.achievedTol = iterativeSolver_->achievedTol();
	} catch (std::runtime_error&) {
	  // Use the default convergence tolerance.  This is a correct
	  // upper bound, since we did actually converge.
	  solveStatus.achievedTol = tmpPL->get("Convergence Tolerance", defaultTol_);

  // Summary message: solver description, solve status, iteration count and
  // the elapsed time reported by totalTimer.
  std::ostringstream ossmessage;
    << "The Belos solver of type \""<<iterativeSolver_->description()
    <<"\" returned a solve status of \""<< toString(solveStatus.solveStatus) << "\""
    << " in " << iterativeSolver_->getNumIters() << " iterations"
    << " with total CPU time of " << totalTimer.totalElapsedTime() << " sec" ;
  if (out.get() && static_cast<int>(verbLevel) > static_cast<int>(Teuchos::VERB_NONE))
    *out << "\n" << ossmessage.str() << "\n";

  solveStatus.message = ossmessage.str();

  // Dump the getNumIters() and the achieved convergence tolerance
  // into solveStatus.extraParameters, as the "Belos/Iteration Count"
  // resp. "Belos/Achieved Tolerance" parameters.
  // NOTE(review): the value arguments of the three set() calls below are
  // not visible in this excerpt.
  if (solveStatus.extraParameters.is_null()) {
    solveStatus.extraParameters = parameterList ();
  solveStatus.extraParameters->set ("Belos/Iteration Count", 
  // package independent version of the same
  solveStatus.extraParameters->set ("Iteration Count", 
  // NOTE (mfh 13 Dec 2011) Though the most commonly used Belos
  // solvers do implement achievedTol(), some Belos solvers currently
  // do not.  In the latter case, if the solver did not converge, the
  // reported achievedTol() value may just be the default "invalid"
  // value -1, and if the solver did converge, the reported value will
  // just be the convergence tolerance (a correct upper bound).
  solveStatus.extraParameters->set ("Belos/Achieved Tolerance", 

//  This information is in the previous line, which is printed anytime the verbosity
//  is not set to Teuchos::VERB_NONE, so I'm commenting this out for now.
//  if (out.get() && static_cast<int>(verbLevel) > static_cast<int>(Teuchos::VERB_NONE))
//    *out << "\nTotal solve time in Belos = "<<totalTimer.totalElapsedTime()<<" sec\n";
  return solveStatus;

    /// \brief Benchmark TSQR::KokkosNodeTsqr and print the cumulative timing.
    ///
    /// Builds a KokkosNodeTsqr instance from \c cacheSizeHint and
    /// \c numPartitions, generates a numRows by numCols test matrix,
    /// performs a fixed number of warm-up factor() / explicit_Q()
    /// passes before the timer is created, then performs \c numTrials
    /// more passes and prints the value returned by timer.stop() to
    /// std::cout.
    ///
    /// \param node [in] Node instance given to the TSQR object via
    ///   setNode().
    /// \param numTrials [in] Number of factor() / explicit_Q() passes
    ///   in the benchmark loop.
    /// \param numRows [in] Number of rows in the test matrix.
    /// \param numCols [in] Number of columns in the test matrix; the R
    ///   factor is numCols by numCols.
    /// \param numPartitions [in] Value of the "Num Tasks" parameter.
    /// \param cacheSizeHint [in] Value of the "Cache Size Hint"
    ///   parameter.
    /// \param contiguousCacheBlocks [in] If true, the working copy of
    ///   the matrix is rearranged with cache_block(), and the flag is
    ///   forwarded to factor().
    /// \param printFieldNames [in] In the comma-delimited format,
    ///   whether to first print a "%"-prefixed line of field names.
    /// \param humanReadable [in] Selects labeled multi-line output
    ///   versus a single comma-delimited line.
    ///
    /// NOTE(review): closing braces, some continuation lines (e.g. the
    /// last argument of the explicit_Q() calls), and any timer.start()
    /// call do not appear in this copy of the routine -- confirm
    /// against the upstream source before building.
    benchmarkKokkosNodeTsqr (const Teuchos::RCP<NodeType>& node,
                             const int numTrials,
                             const Ordinal numRows,
                             const Ordinal numCols,
                             const int numPartitions,
                             const size_t cacheSizeHint,
                             const bool contiguousCacheBlocks,
                             const bool printFieldNames,
                             const bool humanReadable)
      using Teuchos::ParameterList;
      using Teuchos::parameterList;
      using Teuchos::RCP;
      using Teuchos::TypeNameTraits;
      using std::cerr;
      using std::cout;
      using std::endl;
      typedef TSQR::KokkosNodeTsqr<Ordinal, Scalar, NodeType> node_tsqr_type;
      typedef typename node_tsqr_type::FactorOutput factor_output_type;
      typedef Teuchos::ScalarTraits<Scalar> STS;
      // typedef typename STS::magnitudeType magnitude_type;
      typedef Teuchos::Time timer_type;
      typedef Matrix<Ordinal, Scalar> matrix_type;

      // Human-readable name of the Scalar type, used only in the output.
      const std::string scalarTypeName = TypeNameTraits<Scalar>::name();

      // Pseudorandom normal(0,1) generator.  Default seed is OK,
      // because this is a benchmark, not an accuracy test.
      TSQR::Random::NormalGenerator<Ordinal, Scalar> gen;

      // Set up TSQR implementation.
      RCP<ParameterList> params = parameterList ("Intranode TSQR");
      params->set ("Cache Size Hint", cacheSizeHint);
      params->set ("Num Tasks", numPartitions);
      node_tsqr_type actor (params);
      actor.setNode (node);

      // Allocate space for test problem.
      matrix_type A (numRows, numCols);
      matrix_type A_copy (numRows, numCols);
      matrix_type Q (numRows, numCols);
      matrix_type R (numCols, numCols);

      // Fill R with zeros, since the factorization may not overwrite
      // the strict lower triangle of R.
      R.fill (STS::zero());

      // Create a test problem
      nodeTestProblem (gen, numRows, numCols, A.get(), A.lda(), false);

      // Copy A into A_copy, since TSQR overwrites the input.  If
      // specified, rearrange the data in A_copy so that the data in
      // each cache block is contiguously stored.
      if (contiguousCacheBlocks) {
        actor.cache_block (numRows, numCols, A_copy.get(), A.get(), A.lda());
      } else {
        deep_copy (A_copy, A);

      // Do a few timing runs and throw away the results, just to warm
      // up any libraries that do autotuning.
      const int numWarmupRuns = 5;
      for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun) {
        // Factor the matrix in-place in A_copy, and extract the
        // resulting R factor into R.
        factor_output_type factor_output =
          actor.factor (numRows, numCols, A_copy.get(), A_copy.lda(),
                        R.get(), R.lda(), contiguousCacheBlocks);
        // Compute the explicit Q factor (which was stored
        // implicitly in A_copy and factor_output) and store in Q.
        // We don't need to un-cache-block the output, because we
        // aren't verifying it here.
        actor.explicit_Q (numRows, numCols, A_copy.get(), A_copy.lda(),
                          factor_output, numCols, Q.get(), Q.lda(),

      // Benchmark intranode TSQR for numTrials trials.
      // Name of timer doesn't matter here; we only need the timing.
      timer_type timer("KokkosNodeTsqr");
      for (int trialNum = 0; trialNum < numTrials; ++trialNum) {
        // Factor the matrix in-place in A_copy, and extract the
        // resulting R factor into R.
        factor_output_type factor_output =
          actor.factor (numRows, numCols, A_copy.get(), A_copy.lda(),
                        R.get(), R.lda(), contiguousCacheBlocks);
        // Compute the explicit Q factor (which was stored
        // implicitly in A_copy and factor_output) and store in Q.
        // We don't need to un-cache-block the output, because we
        // aren't verifying it here.
        actor.explicit_Q (numRows, numCols, A_copy.get(), A_copy.lda(),
                          factor_output, numCols, Q.get(), Q.lda(),
      // The value returned by stop() is what gets reported below as the
      // total time for all trials.
      const double timing = timer.stop();

      // Print the results
      if (humanReadable) {
        cout << "KokkosNodeTsqr cumulative timings:" << endl
             << "Scalar type: " << scalarTypeName << endl
             << "# rows = " << numRows << endl
             << "# columns = " << numCols << endl
             << "# partitions: " << numPartitions << endl
             << "Cache size hint (in bytes) = " << actor.cache_size_hint() << endl
             << "Contiguous cache blocks? " << contiguousCacheBlocks << endl
             << "# trials = " << numTrials << endl
             << "Total time (s) = " << timing << endl;
      else {
        // Comma-delimited output: optional header line, then one data line.
        if (printFieldNames) {
          const char prefix[] = "%";
          cout << prefix
               << "method"
               << ",scalarType"
               << ",numRows"
               << ",numCols"
               << ",numPartitions"
               << ",cacheSizeHint"
               << ",contiguousCacheBlocks"
               << ",numTrials"
               << ",timing"
               << endl;

        // We don't include {min,max}_seq_apply_timing() here, because
        // those times don't benefit from the accuracy of benchmarking
        // for numTrials > 1.  Thus, it's misleading to include them
        // with tbb_tsqr_timing, the total time over numTrials trials.
        //
        // NOTE(review): neither {min,max}_seq_apply_timing() nor
        // tbb_tsqr_timing is used in this routine (the total is in
        // 'timing'); the comment above looks carried over from another
        // benchmark -- confirm and update.
        cout << "KokkosNodeTsqr"
             << "," << scalarTypeName
             << "," << numRows
             << "," << numCols
             << "," << numPartitions
             << "," << actor.cache_size_hint()
             << "," << contiguousCacheBlocks
             << "," << numTrials
             << "," << timing
             << endl;
    /// \brief Test the accuracy of TSQR::KokkosNodeTsqr on one test problem.
    ///
    /// Generates a numRows by numCols test matrix A with \c gen, factors
    /// a copy of it with KokkosNodeTsqr::factor(), forms the explicit Q
    /// factor with explicit_Q(), passes A, Q and R to local_verify(),
    /// and prints the three values it returns to std::cout.  When
    /// \c debug is set, progress messages (and the matrices, subject to
    /// the row-count checks below) are written to std::cerr.
    ///
    /// \param node [in] Node instance given to the TSQR object via
    ///   setNode().
    /// \param gen [in/out] Pseudorandom generator passed to
    ///   nodeTestProblem() to build the test matrix.
    /// \param numRows [in] Number of rows in the test matrix.
    /// \param numCols [in] Number of columns in the test matrix; the R
    ///   factor is numCols by numCols.
    /// \param numPartitions [in] Value of the "Num Tasks" parameter.
    /// \param cacheSizeHint [in] Value of the "Cache Size Hint"
    ///   parameter.
    /// \param contiguousCacheBlocks [in] If true, the working copy is
    ///   rearranged with cache_block() before factoring, and Q is
    ///   rearranged back with un_cache_block() before verification.
    /// \param printFieldNames [in] In the comma-delimited format,
    ///   whether to first print a "%"-prefixed line of field names.
    /// \param humanReadable [in] Selects labeled multi-line output
    ///   versus a single comma-delimited line.
    /// \param debug [in] Whether to print diagnostics to std::cerr and
    ///   run the cache blocking round-trip check.
    ///
    /// \throw std::logic_error from the debug-mode cache blocking
    ///   check, if un_cache_block() applied to the cache-blocked copy
    ///   does not compare equal to A.
    ///
    /// NOTE(review): closing braces and some continuation lines (e.g.
    /// the last argument of explicit_Q()) do not appear in this copy
    /// of the routine -- confirm against the upstream source.
    verifyKokkosNodeTsqr (const Teuchos::RCP<NodeType>& node,
                          TSQR::Random::NormalGenerator<Ordinal, Scalar>& gen,
                          const Ordinal numRows,
                          const Ordinal numCols,
                          const int numPartitions,
                          const size_t cacheSizeHint,
                          const bool contiguousCacheBlocks,
                          const bool printFieldNames,
                          const bool humanReadable,
                          const bool debug)
      using Teuchos::ParameterList;
      using Teuchos::parameterList;
      using Teuchos::RCP;
      using Teuchos::TypeNameTraits;
      using std::cerr;
      using std::cout;
      using std::endl;
      typedef TSQR::KokkosNodeTsqr<Ordinal, Scalar, NodeType> node_tsqr_type;
      typedef typename node_tsqr_type::FactorOutput factor_output_type;
      typedef Teuchos::ScalarTraits<Scalar> STS;
      typedef typename STS::magnitudeType magnitude_type;
      // typedef Teuchos::Time timer_type;
      typedef Matrix<Ordinal, Scalar> matrix_type;
      typedef MatView<Ordinal, Scalar> mat_view_type;

      // Human-readable name of the Scalar type, used only in the output.
      const std::string scalarTypeName = TypeNameTraits<Scalar>::name();

      // Set up TSQR implementation.
      RCP<ParameterList> params = parameterList ("Intranode TSQR");
      params->set ("Cache Size Hint", cacheSizeHint);
      params->set ("Num Tasks", numPartitions);
      node_tsqr_type actor (params);
      actor.setNode (node);
      if (debug)
          cerr << actor.description() << endl;
          if (contiguousCacheBlocks)
            cerr << "-- Test with contiguous cache blocks" << endl;

      // Allocate space for test problem.
      matrix_type A (numRows, numCols);
      matrix_type A_copy (numRows, numCols);
      matrix_type Q (numRows, numCols);
      matrix_type R (numCols, numCols);
      // Pre-fill every matrix with quiet NaNs when the Scalar type has
      // them.
      // NOTE(review): the zero fills below follow the NaN fills
      // directly; presumably they belong to an else branch for Scalar
      // types without a quiet NaN -- confirm against upstream.
      if (std::numeric_limits<Scalar>::has_quiet_NaN)
          A.fill (std::numeric_limits<Scalar>::quiet_NaN());
          A_copy.fill (std::numeric_limits<Scalar>::quiet_NaN());
          Q.fill (std::numeric_limits<Scalar>::quiet_NaN());
          R.fill (std::numeric_limits<Scalar>::quiet_NaN());
          A.fill (STS::zero());
          A_copy.fill (STS::zero());
          Q.fill (STS::zero());
          R.fill (STS::zero());
      // Leading dimensions handed to local_verify() at the end.
      const Ordinal lda = numRows;
      const Ordinal ldq = numRows;
      const Ordinal ldr = numCols;

      // Create a test problem
      nodeTestProblem (gen, numRows, numCols, A.get(), A.lda(), true);

      if (debug)
          cerr << "-- Generated test problem" << endl;
          // Don't print the matrix if it's too big.
          if (A.nrows() <= 30)
              cerr << "A = " << endl;
              print_local_matrix (cerr, A.nrows(), A.ncols(),
                                  A.get(), A.lda());
              cerr << endl << endl;

      // Copy A into A_copy, since TSQR overwrites the input.  If
      // specified, rearrange the data in A_copy so that the data in
      // each cache block is contiguously stored.
      if (! contiguousCacheBlocks) {
        deep_copy (A_copy, A);
        if (debug) {
          cerr << "-- Copied test problem from A into A_copy" << endl;
          // Don't print the matrix if it's too big.
          if (A_copy.nrows() <= 30) {
            cerr << "A_copy = " << endl;
            print_local_matrix (cerr, A_copy.nrows(), A_copy.ncols(),
                                A_copy.get(), A_copy.lda());
            cerr << endl << endl;
      else {
        actor.cache_block (numRows, numCols, A_copy.get(), A.get(), A.lda());
        if (debug) {
          cerr << "-- Reorganized test matrix to have contiguous "
            "cache blocks" << endl;
          // Don't print the matrix if it's too big.
          if (A_copy.nrows() <= 30) {
            cerr << "A_copy = " << endl;
            print_local_matrix (cerr, A_copy.nrows(), A_copy.ncols(),
                                A_copy.get(), A_copy.lda());
            cerr << endl << endl;

        // Verify cache blocking, when in debug mode.
        // The check is a round trip: un-cache-block A_copy into a
        // scratch matrix A2 and compare A2 with the original A.
        if (debug) {
          matrix_type A2 (numRows, numCols);
          if (std::numeric_limits<Scalar>::has_quiet_NaN) {
            A2.fill (std::numeric_limits<Scalar>::quiet_NaN());

          actor.un_cache_block (numRows, numCols, A2.get(), A2.lda(), A_copy.get());
          if (matrix_equal (A, A2)) {
            if (debug)
              cerr << "-- Cache blocking test succeeded!" << endl;
          else {
            if (debug) {
              cerr << "*** Cache blocking test failed! A != A2 ***"
                   << endl << endl;
              // Don't print the matrices if they are too big.
              if (A.nrows() <= 30 && A2.nrows() <= 30) {
                cerr << "A = " << endl;
                print_local_matrix (cerr, A.nrows(), A.ncols(),
                                    A.get(), A.lda());
                cerr << endl << "A2 = " << endl;
                print_local_matrix (cerr, A2.nrows(), A2.ncols(),
                                    A2.get(), A2.lda());
                cerr << endl;
            throw std::logic_error ("Cache blocking failed");

      // Fill R with zeros, since the factorization may not
      // necessarily overwrite the strict lower triangle of R.
      if (debug) {
        cerr << "-- Filling R with zeros" << endl;
      R.fill (STS::zero());

      if (debug) {
        cerr << "-- Calling factor()" << endl;

      // Factor the matrix and compute the explicit Q factor
      factor_output_type factor_output =
        actor.factor (numRows, numCols, A_copy.get(), A_copy.lda(),
                      R.get(), R.lda(), contiguousCacheBlocks);
      if (debug) {
        cerr << "-- Finished factor()" << endl;
        cerr << "-- Calling explicit_Q()" << endl;

      // KokkosNodeTsqr isn't designed to be used by itself, so we
      // have to help it along by filling the top ncols x ncols
      // entries with the first ncols columns of the identity matrix.
        mat_view_type Q_top =
          actor.top_block (Q.view (), contiguousCacheBlocks);
        mat_view_type Q_top_square (Q_top.ncols(), Q_top.ncols(),
                                    Q_top.get(), Q_top.lda());
        Q_top_square.fill (STS::zero ());
        for (Ordinal j = 0; j < Q_top_square.ncols(); ++j) {
          Q_top_square(j,j) = STS::one ();
      actor.explicit_Q (numRows, numCols, A_copy.get(), A_copy.lda(),
                        factor_output, numCols, Q.get(), Q.lda(),
      if (debug) {
        cerr << "-- Finished explicit_Q()" << endl;

      // "Un"-cache-block the output Q (the explicit Q factor), if
      // contiguous cache blocks were used.  This is only necessary
      // because local_verify() doesn't currently support contiguous
      // cache blocks.
      if (contiguousCacheBlocks) {
        // Use A_copy as temporary storage for un-cache-blocking Q.
        actor.un_cache_block (numRows, numCols, A_copy.get(),
                              A_copy.lda(), Q.get());
        deep_copy (Q, A_copy);
        if (debug) {
          cerr << "-- Un-cache-blocked output Q factor" << endl;

      // Print out the Q and R factors in debug mode.
      if (debug) {
        // Don't print the matrix if it's too big.
        if (Q.nrows() <= 30) {
          cerr << endl << "-- Q factor:" << endl;
          print_local_matrix (cerr, Q.nrows(), Q.ncols(),
                              Q.get(), Q.lda());
          cerr << endl << endl;
        cerr << endl << "-- R factor:" << endl;
        print_local_matrix (cerr, numCols, numCols, R.get(), R.lda());
        cerr << endl;

      // Validate the factorization
      // The three entries of 'results' are printed below as the
      // residual, the orthogonality measure, and the norm of A.
      std::vector<magnitude_type> results =
        local_verify (numRows, numCols, A.get(), lda,
                      Q.get(), ldq, R.get(), ldr);
      if (debug)
        cerr << "-- Finished local_verify" << endl;

      // Print the results
      // NOTE(review): the human-readable labels call the first two
      // values 2-norms, while the comma-delimited field names call them
      // "absFrobResid" / "absFrobOrthog" -- confirm which norm
      // local_verify() actually returns.
      if (humanReadable) {
        cout << "KokkosNodeTsqr:" << endl
             << "Scalar type: " << scalarTypeName << endl
             << "# rows: " << numRows << endl
             << "# columns: " << numCols << endl
             << "# partitions: " << numPartitions << endl
             << "cache size hint (revised) in bytes: " << actor.cache_size_hint() << endl
             << "contiguous cache blocks? " << contiguousCacheBlocks << endl
             << "Absolute residual $\\|A - Q*R\\|_2$: "
             << results[0] << endl
             << "Absolute orthogonality $\\|I - Q^T*Q\\|_2$: "
             << results[1] << endl
             << "Test matrix norm $\\| A \\|_F$: "
             << results[2] << endl
             << endl;
      else {
        // Comma-delimited output: optional header line, then one data line.
        if (printFieldNames) {
          const char prefix[] = "%";
          cout << prefix
               << "method"
               << ",scalarType"
               << ",numRows"
               << ",numCols"
               << ",numPartitions"
               << ",cacheSizeHint"
               << ",contiguousCacheBlocks"
               << ",absFrobResid"
               << ",absFrobOrthog"
               << ",frobA"
               << endl;
        cout << "KokkosNodeTsqr"
             << "," << scalarTypeName
             << "," << numRows
             << "," << numCols
             << "," << numPartitions
             << "," << actor.cache_size_hint()
             << "," << contiguousCacheBlocks
             << "," << results[0]
             << "," << results[1]
             << "," << results[2]
             << endl;
ファイル: IRTRDriver.hpp プロジェクト: 00liujj/trilinos
  /// \brief Run one IRTR eigensolver test and record whether it passed.
  ///
  /// Calls TpetraExamples::IRTR with scalar type \c S (inner type
  /// \c Sinner) on the matrix A, then checks the residual of the
  /// returned eigenvalue estimate against the "tolerance" entry of
  /// \c params.  The outcome is stored in \c testPassed, and timer
  /// statistics are written to \c out.
  ///
  /// \param myMachPL Not referenced by the statements visible here.
  /// \param comm [in] Communicator; only its rank and size are printed.
  /// \param node [in] Node instance; only its type name is printed.
  ///
  /// NOTE(review): closing braces and the statements that read the
  /// matrix into A and apply it to x (so that r holds A*x) do not
  /// appear in this copy of the routine -- confirm against upstream.
  void run(Teuchos::ParameterList &myMachPL, const Teuchos::RCP<const Teuchos::Comm<int> > &comm, const Teuchos::RCP<Node> &node) 
    using std::plus;
    using std::endl;
    using Teuchos::null;
    using Teuchos::RCP;
    using Teuchos::ParameterList;
    using Teuchos::parameterList;
    using Tpetra::RTI::ZeroOp;

    // Static types
    typedef int                     LO;
    typedef int                     GO;
    typedef Tpetra::Map<LO,GO,Node>               Map;
    typedef Tpetra::CrsMatrix<S,LO,GO,Node> CrsMatrix;
    typedef Tpetra::Vector<S,LO,GO,Node>       Vector;
    typedef Teuchos::ScalarTraits<S> ST;

    IRTRdetails::fpu_fix<S> ff; ff.fix();

    *out << "Running test with Node==" << Teuchos::typeName(*node) << " on rank " << comm->getRank() << "/" << comm->getSize() << std::endl;

    // read the matrix
    RCP<CrsMatrix> A;
    RCP<const Map> rowMap = null;
    RCP<ParameterList> fillParams = parameterList();
    // must preserve the local graph in order to do convert() calls later
    // (only needed when the inner scalar type differs from S)
    if (Teuchos::TypeTraits::is_same<S,Sinner>::value) {
      fillParams->set("Preserve Local Graph",false);
    else {
      fillParams->set("Preserve Local Graph",true);
    rowMap = A->getRowMap();

    testPassed = true;

    // compute an initial vector
    auto x = Tpetra::createVector<S>(rowMap);

    // call the solve
    S lambda = TpetraExamples::IRTR<Sinner,S,LO,GO,Node>(out,*params,A,x);

    // check that residual is as requested
      auto r = Tpetra::createVector<S>(rowMap);
      // compute A*x - x*lambda, while simultaneously computing |A*x - x*lambda|
      // (assumes r already holds A*x at this point -- see NOTE above)
      const S r_r = XFORM_REDUCE(r, x,                          // fused: 
                                 r - x*lambda,                  //      : r = r - x*lambda = A*x - x*lambda
                                 r*r, ZeroOp<S>, plus<S>() );   //      : sum r'*r
      const S rnrm = Teuchos::ScalarTraits<S>::squareroot(r_r);
      // check that residual is as requested
      *out << "|A*x - x*lambda|/|lambda|: " << rnrm / ST::magnitude(lambda) << endl;
      const double tolerance = params->get<double>("tolerance");
      // Test the same relative residual that is printed above.  Dividing
      // by the signed lambda instead would make the ratio negative for a
      // negative eigenvalue, so the check could never fail.
      if (rnrm / ST::magnitude(lambda) > tolerance) testPassed = false;


    // print timings
    Teuchos::TimeMonitor::summarize( *out );
  // Parameter list and body of a routine that clones a Tpetra linear
  // system (and any MueLu preconditioners) onto another node type,
  // solves the cloned system with
  // IntrepidPoissonExample::solveWithBelos, and copies the solution
  // back into X.
  //
  // NOTE(review): the line carrying the function name and return
  // type, and the closing braces, do not appear in this copy of the
  // routine -- confirm against the upstream source.
  //
  // converged, numItersPerformed [out], tol, maxNumIters, num_steps [in]:
  //   forwarded unchanged to solveWithBelos.
  // clone_node [in]: node instance the matrix, vectors, and
  //   preconditioners are cloned onto.
  // X [in/out]: cloned before the solve, then overwritten with the
  //   cloned solution afterwards.
  // A, B [in]: matrix and right-hand side; only their clones are
  //   passed to the solver.
  // prec_type [in]: preconditioners are cloned only when this equals
  //   "MueLu".
  // M_left, M_right [in]: optional preconditioners; null by default.
  bool& converged,
  int& numItersPerformed,
  const Teuchos::ScalarTraits<ST>::magnitudeType& tol,
  const int maxNumIters,
  const int num_steps,
  const Teuchos::RCP<CloneNode>& clone_node,
  const Teuchos::RCP<multivector_type>& X,
  const Teuchos::RCP<const sparse_matrix_type>& A,
  const Teuchos::RCP<const multivector_type>& B,
  const std::string& prec_type,
  const Teuchos::RCP<const operator_type>& M_left=Teuchos::null,
  const Teuchos::RCP<const operator_type>& M_right=Teuchos::null) {

  using Teuchos::ParameterList;
  using Teuchos::parameterList;
  using Teuchos::RCP;
  using Teuchos::rcp_dynamic_cast;

  typedef Tpetra::CrsMatrix<ST, LO, GO, CloneNode>    clone_sparse_matrix_type;
  typedef Tpetra::Operator<ST, LO, GO, CloneNode>     clone_operator_type;
  typedef Tpetra::MultiVector<ST, LO, GO, CloneNode>  clone_multi_vector_type;
  typedef typename KokkosClassic::DefaultKernels<ST,LO,CloneNode>::SparseOps clone_sparse_ops;
  // Short aliases used as the template arguments of solveWithBelos.
  typedef clone_multi_vector_type MV;
  typedef clone_operator_type OP;

  // Clone Matrix, RHS, LHS
  RCP<ParameterList> plClone = parameterList();
  RCP<clone_sparse_matrix_type> A_clone;
  RCP<clone_multi_vector_type> B_clone, X_clone;
    TEUCHOS_FUNC_TIME_MONITOR_DIFF("Clone System", clone_system);
    A_clone = A->clone(clone_node, plClone);
    B_clone = B->clone(clone_node);
    X_clone = X->clone(clone_node);

  // Clone preconditioner(s)
  // The clones stay null unless a preconditioner is cloned below.
  RCP<const clone_operator_type> M_left_clone, M_right_clone;
    TEUCHOS_FUNC_TIME_MONITOR_DIFF("Clone Preconditioner", clone_prec);

    // NOTE(review): the result of rcp_dynamic_cast is dereferenced
    // without a null check; if the operator is not a
    // MueLu::TpetraOperator the cast presumably yields null -- confirm
    // whether throw-on-fail should be requested here.
    if (M_left != Teuchos::null && prec_type == "MueLu") {
      RCP< const MueLu::TpetraOperator<ST,LO,GO,Node> > M_muelu =
        rcp_dynamic_cast<const MueLu::TpetraOperator<ST,LO,GO,Node> >(M_left);
      M_left_clone = M_muelu->clone<CloneNode, clone_sparse_ops>(clone_node);
    if (M_right != Teuchos::null && prec_type == "MueLu") {
      RCP< const MueLu::TpetraOperator<ST,LO,GO,Node> > M_muelu =
        rcp_dynamic_cast<const MueLu::TpetraOperator<ST,LO,GO,Node> >(M_right);
      M_right_clone = M_muelu->clone<CloneNode, clone_sparse_ops>(clone_node);
    // NOTE(review): the lines below look like the arguments of an
    // exception-test macro for builds without MueLu; the macro name
    // and any preprocessor guards are not visible here.
      prec_type == "MueLu", std::runtime_error, "Tpetra scaling example: "
      "In order to precondition with MueLu, you must have built Trilinos "
      "with the MueLu package enabled.");

  // Solve
    TEUCHOS_FUNC_TIME_MONITOR_DIFF("Clone Solve", clone_solve);
    IntrepidPoissonExample::solveWithBelos<ST,MV,OP> (
      converged, numItersPerformed, tol, maxNumIters, num_steps,
      X_clone, A_clone, B_clone, M_left_clone, M_right_clone);

  // Copy X_clone back into X
  // X_clone is first cloned back onto the node of X's map, then
  // combined into X with coefficients 1.0 (for X_host) and 0.0.
    TEUCHOS_FUNC_TIME_MONITOR_DIFF("Clone Solution", clone_sol);
    RCP<multivector_type> X_host = X_clone->clone(X->getMap()->getNode());
    X->update(1.0, *X_host, 0.0);
/// \brief Construct the solver from application parameters, a model,
///   and an integration observer.
///
/// Reads the "Rythmos" sublist of the application parameter list and
/// uses it to choose the verbosity level ("Verbosity Level"), the final
/// time ("Final Time", default 0.1), the nonlinear time-step solver
/// ("Nonlinear Solver Type": "Rythmos" or "NOX"), the stepper
/// ("Stepper Type": "Backward Euler" (default), "Explicit RK" or
/// "BDF"), an optional step control strategy ("Step Control Strategy
/// Type": "None" (default) or "ImplicitBDFRamping"), and the
/// integration control strategy ("Rythmos Integration Control
/// Strategy": "Simple" (default) or "Ramping").
///
/// \param in_appParams Application parameter list.
/// \param in_model Model evaluator to integrate.
/// \param in_observer Integration observer; the body tests it against
///   Teuchos::null.
///
/// \throw Teuchos::Exceptions::InvalidParameter for an unrecognized
///   "Stepper Type".
/// \throw std::logic_error for an unrecognized step control strategy
///   type, or (per its message) when the NOX solver is requested in a
///   build without NOX.
///
/// NOTE(review): the body refers to appParams, model and observer
/// rather than the in_* arguments; presumably the member initializer
/// list (not visible here) copies them -- confirm.  Closing braces and
/// several branch bodies are also missing from this copy.
Piro::RythmosSolver<Scalar>::RythmosSolver(Teuchos::RCP<Teuchos::ParameterList> in_appParams,
                          Teuchos::RCP< Thyra::ModelEvaluatorDefaultBase<Scalar> > in_model,
                          Teuchos::RCP<Rythmos::IntegrationObserverBase<Scalar> > in_observer) :
  // For dumping default parameters from Rythmos
    //Rythmos::IntegratorBuilder<double> b;
    //std::cout << *(b.getValidParameters()) << std::endl;
    //Teuchos::writeParameterListToXmlFile(*b.getValidParameters(), "sample.xml");

  using Teuchos::ParameterList;
  using Teuchos::parameterList;
  using Teuchos::RCP;
  using Teuchos::rcp;

  out = Teuchos::VerboseObjectBase::getDefaultOStream();

  // Number of parameter vectors and responses reported by the model.
  num_p = model->createInArgs().Np();
  num_g = model->createOutArgs().Ng();

//   TEUCHOS_TEST_FOR_EXCEPTION(num_p > 1, Teuchos::Exceptions::InvalidParameter,
//                      std::endl << "Error in Piro::RythmosSolver " <<
//                      "Not Implemented for Np>1 : " << num_p << std::endl);
//   TEUCHOS_TEST_FOR_EXCEPTION(num_g > 1, Teuchos::Exceptions::InvalidParameter,
//                      std::endl << "Error in Piro::RythmosSolver " <<
//                      "Not Implemented for Ng>1 : " << num_g << std::endl);

  *out << "\nA) Get the base parameter list ...\n";

  RCP<Teuchos::ParameterList> rythmosPL = sublist(appParams, "Rythmos", true);

    // Map the "Verbosity Level" string onto the Teuchos enum; any
    // unrecognized string leaves the level at VERB_DEFAULT.
    const std::string verbosity = rythmosPL->get("Verbosity Level", "VERB_DEFAULT");
    solnVerbLevel = Teuchos::VERB_DEFAULT;
    if      (verbosity == "VERB_NONE")    solnVerbLevel = Teuchos::VERB_NONE;
    else if (verbosity == "VERB_LOW")     solnVerbLevel = Teuchos::VERB_LOW;
    else if (verbosity == "VERB_MEDIUM")  solnVerbLevel = Teuchos::VERB_MEDIUM;
    else if (verbosity == "VERB_HIGH")    solnVerbLevel = Teuchos::VERB_HIGH;
    else if (verbosity == "VERB_EXTREME") solnVerbLevel = Teuchos::VERB_EXTREME;

  t_final = rythmosPL->get("Final Time", 0.1);
  const std::string stepperType = rythmosPL->get("Stepper Type", "Backward Euler");
  *out << "\nC) Create and initalize the forward model ...\n";
  *out << "\nD) Create the stepper and integrator for the forward problem ...\n";
  // Choose the nonlinear solver used inside each time step.
  // NOTE(review): the Rythmos time-step solver is declared with
  // <double> while the rest of this class is templated on Scalar --
  // confirm whether that is intended.
  if (rythmosPL->get<std::string>("Nonlinear Solver Type") == "Rythmos") {
    Teuchos::RCP<Rythmos::TimeStepNonlinearSolver<double> > rythmosTimeStepSolver = 
    if (rythmosPL->getEntryPtr("NonLinear Solver")) {
      RCP<Teuchos::ParameterList> nonlinePL =
	sublist(rythmosPL, "NonLinear Solver", true);
    fwdTimeStepSolver = rythmosTimeStepSolver;
  else if (rythmosPL->get<std::string>("Nonlinear Solver Type") == "NOX") {
#ifdef Piro_ENABLE_NOX
    Teuchos::RCP<Thyra::NOXNonlinearSolver> nox_solver =  Teuchos::rcp(new Thyra::NOXNonlinearSolver);
    Teuchos::RCP<Teuchos::ParameterList> nox_params = Teuchos::rcp(new Teuchos::ParameterList);
    *nox_params = appParams->sublist("NOX");
    fwdTimeStepSolver = nox_solver;
    TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,"Requested NOX solver for a Rythmos Transient solve, Trilinos was not built with NOX enabled.  Please rebuild Trilinos or use the native Rythmos nonlinear solver.");
  // Build the time stepper selected by "Stepper Type"; each one takes
  // its settings from the "Rythmos Stepper" sublist.
  if (stepperType == "Backward Euler") {
    fwdStateStepper = Rythmos::backwardEulerStepper<Scalar> (model, fwdTimeStepSolver);
    fwdStateStepper->setParameterList(sublist(rythmosPL, "Rythmos Stepper", true));
  else if (stepperType == "Explicit RK") {
    fwdStateStepper = Rythmos::explicitRKStepper<Scalar>(model);
    fwdStateStepper->setParameterList(sublist(rythmosPL, "Rythmos Stepper", true));
  else if (stepperType == "BDF") {
    Teuchos::RCP<Teuchos::ParameterList> BDFparams = 
      Teuchos::sublist(rythmosPL, "Rythmos Stepper", true);
    Teuchos::RCP<Teuchos::ParameterList> BDFStepControlPL =
      Teuchos::sublist(BDFparams,"Step Control Settings");
    fwdStateStepper = Teuchos::rcp( new Rythmos::ImplicitBDFStepper<Scalar>(model,fwdTimeStepSolver,BDFparams) );
    // NOTE(review): this message and the one further below name
    // "Piro::Epetra::RythmosSolver" although this class is
    // Piro::RythmosSolver -- confirm which is intended.
    TEUCHOS_TEST_FOR_EXCEPTION( true, Teuchos::Exceptions::InvalidParameter,
				std::endl << "Error! Piro::Epetra::RythmosSolver: Invalid Steper Type: "
				<< stepperType << std::endl);
  // Step control strategy
    // If the stepper can accept a step control strategy, then attempt to build one.
    RCP<Rythmos::StepControlStrategyAcceptingStepperBase<Scalar> > scsa_stepper = 
      Teuchos::rcp_dynamic_cast<Rythmos::StepControlStrategyAcceptingStepperBase<Scalar> >(fwdStateStepper);

    if ( nonnull(scsa_stepper) ) {
      std::string step_control_strategy = rythmosPL->get("Step Control Strategy Type", "None");

      if (step_control_strategy == "None") {
	// don't do anything, stepper will build default
      else if (step_control_strategy == "ImplicitBDFRamping") {
	const RCP<Rythmos::ImplicitBDFStepperRampingStepControl<Scalar> > rscs = 
	  rcp(new Rythmos::ImplicitBDFStepperRampingStepControl<Scalar>);
	const RCP<ParameterList> p = parameterList(rythmosPL->sublist("Rythmos Step Control Strategy"));

      else {
	TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,"Error! Piro::Epetra::RythmosSolver: Invalid step control strategy type: " << step_control_strategy << std::endl);

      // Settings handed to whichever integration control strategy is
      // selected below.
      integrationControlPL = sublist(rythmosPL, "Rythmos Integration Control", true);
    // NOTE(review): no visible branch handles a strategy other than
    // "Simple" or "Ramping"; in that case defaultIntegrator would
    // presumably stay null -- confirm.
    RCP<Rythmos::DefaultIntegrator<Scalar> > defaultIntegrator;
    if (rythmosPL->get("Rythmos Integration Control Strategy", "Simple") == "Simple") {
      defaultIntegrator = Rythmos::controlledDefaultIntegrator<Scalar>(Rythmos::simpleIntegrationControlStrategy<Scalar>(integrationControlPL));
    else if(rythmosPL->get<std::string>("Rythmos Integration Control Strategy") == "Ramping") {
      defaultIntegrator = Rythmos::controlledDefaultIntegrator<Scalar>(Rythmos::rampingIntegrationControlStrategy<Scalar>(integrationControlPL));  

    fwdStateIntegrator = defaultIntegrator;

  fwdStateIntegrator->setParameterList(sublist(rythmosPL, "Rythmos Integrator", true));
  if (observer != Teuchos::null) 
  /// \brief Set (or reset) this solver manager's parameters.
  ///
  /// Validates the given list against the current parameters (filling
  /// in defaults), caches the individual settings, stores the list,
  /// and (re)creates the solve timer, the output manager, and the
  /// status tests as needed.
  ///
  /// \param params [in/out] Parameters to use.  The list is modified
  ///   in place by validateParametersAndSetDefaults().  If null, a
  ///   fresh copy of the valid parameter list is used instead.
  ///
  /// NOTE(review): closing braces and the last argument line of the
  /// stoFactory.create() call do not appear in this copy of the
  /// routine -- confirm against the upstream source.
  MinresSolMgr<ScalarType, MV, OP>::
  setParameters (const Teuchos::RCP<Teuchos::ParameterList>& params)
    using Teuchos::ParameterList;
    using Teuchos::parameterList;
    using Teuchos::RCP;
    using Teuchos::rcp;
    using Teuchos::rcpFromRef;
    using Teuchos::null;
    using Teuchos::is_null;
    using std::string;
    using std::ostream;
    using std::endl;

    // On first use there are no stored parameters yet; start from a
    // copy of the valid (default) list so there is something to
    // validate against.
    if (params_.is_null()) {
      params_ = parameterList (*getValidParameters());
    // A null input list would be dereferenced by the validation call
    // below, so fall back to a copy of the valid parameters.
    RCP<ParameterList> pl = params.is_null() ? parameterList (*getValidParameters()) : params;
    pl->validateParametersAndSetDefaults (*params_);

    // Read parameters from the parameter list.  We have already
    // populated it with defaults.
    blockSize_ = pl->get<int> ("Block Size");
    verbosity_ = pl->get<int> ("Verbosity");
    outputStyle_ = pl->get<int> ("Output Style");
    outputFreq_ = pl->get<int>("Output Frequency");
    outputStream_ = pl->get<RCP<std::ostream> > ("Output Stream");
    convtol_ = pl->get<MagnitudeType> ("Convergence Tolerance");
    maxIters_ = pl->get<int> ("Maximum Iterations");
    // All done reading parameters from the parameter list.
    // Now we know it's valid and we can store it.
    params_ = pl;

    // Change the timer label, and create the timer if necessary.
    const string newLabel = pl->get<string> ("Timer Label");
      if (newLabel != label_ || timerSolve_.is_null()) {
	// Full name the existing timer (if any) was registered under.
	// It must be computed before label_ is overwritten; clearing by
	// the new bare label would leave the old counter registered.
	const string oldSolveLabel = label_ + ": MinresSolMgr total solve time";
	label_ = newLabel;
	const string solveLabel = label_ + ": MinresSolMgr total solve time";
	// Unregister the old timer before creating a new one.
	if (! timerSolve_.is_null()) {
	  Teuchos::TimeMonitor::clearCounter (oldSolveLabel);
	  timerSolve_ = Teuchos::null;
	timerSolve_ = Teuchos::TimeMonitor::getNewCounter (solveLabel);

    // Create output manager, if necessary; otherwise, set its parameters.
    bool recreatedPrinter = false;
    if (printer_.is_null()) {
      printer_ = rcp (new OutputManager<ScalarType> (verbosity_, outputStream_));
      recreatedPrinter = true;
    } else {
      // Set the output stream's verbosity level.
      printer_->setVerbosity (verbosity_);
      // Tell the output manager about the new output stream.
      printer_->setOStream (outputStream_);

    // Set up the convergence tests
    typedef StatusTestGenResNorm<ScalarType, MV, OP> res_norm_type;
    typedef StatusTestCombo<ScalarType, MV, OP> combo_type;

    // Do we need to allocate at least one of the implicit or explicit
    // residual norm convergence tests?
    const bool allocatedConvergenceTests =
      impConvTest_.is_null() || expConvTest_.is_null();

    // Allocate or set the tolerance of the implicit residual norm
    // convergence test.
    if (impConvTest_.is_null()) {
      impConvTest_ = rcp (new res_norm_type (convtol_));
      impConvTest_->defineResForm (res_norm_type::Implicit, TwoNorm);
      // TODO (mfh 03 Nov 2011) Allow users to define the type of
      // scaling (or a custom scaling factor).
      impConvTest_->defineScaleForm (NormOfInitRes, TwoNorm);
    } else {
      impConvTest_->setTolerance (convtol_);

    // Allocate or set the tolerance of the explicit residual norm
    // convergence test.
    if (expConvTest_.is_null()) {
      expConvTest_ = rcp (new res_norm_type (convtol_));
      expConvTest_->defineResForm (res_norm_type::Explicit, TwoNorm);
      // TODO (mfh 03 Nov 2011) Allow users to define the type of
      // scaling (or a custom scaling factor).
      expConvTest_->defineScaleForm (NormOfInitRes, TwoNorm);
    } else {
      expConvTest_->setTolerance (convtol_);

    // Whether we need to recreate the full status test.  We only need
    // to do that if at least one of convTest_ or maxIterTest_ had to
    // be reallocated.
    bool needToRecreateFullStatusTest = sTest_.is_null();

    // Residual status test is a combo of the implicit and explicit
    // convergence tests.
    if (convTest_.is_null() || allocatedConvergenceTests) {
      convTest_ = rcp (new combo_type (combo_type::SEQ, impConvTest_, expConvTest_));
      needToRecreateFullStatusTest = true;

    // Maximum number of iterations status test.  It tells the solver to
    // stop iteration, if the maximum number of iterations has been
    // exceeded.  Initialize it if we haven't yet done so, otherwise
    // tell it the new maximum number of iterations.
    if (maxIterTest_.is_null()) {
      maxIterTest_ = rcp (new StatusTestMaxIters<ScalarType,MV,OP> (maxIters_));
      needToRecreateFullStatusTest = true;
    } else {
      maxIterTest_->setMaxIters (maxIters_);

    // Create the full status test if we need to.
    // The full status test: the maximum number of iterations have
    // been reached, OR the residual has converged.
    // "If we need to" means either that the status test was never
    // created before, or that its two component tests had to be
    // reallocated.
    if (needToRecreateFullStatusTest) {
      sTest_ = rcp (new combo_type (combo_type::OR, maxIterTest_, convTest_));

    // If necessary, create the status test output class.  This class
    // manages and formats the output from the status test.  We have
    // to recreate the output test if we had to (re)allocate either
    // printer_ or sTest_.
    if (outputTest_.is_null() || needToRecreateFullStatusTest || recreatedPrinter) {
      StatusTestOutputFactory<ScalarType,MV,OP> stoFactory (outputStyle_);
      outputTest_ = stoFactory.create (printer_, sTest_, outputFreq_,
    } else {
      outputTest_->setOutputFrequency (outputFreq_);
    // Set the solver string for the output test.
    // StatusTestOutputFactory has no constructor argument for this.
    outputTest_->setSolverDesc (std::string (" MINRES "));

    // Inform the solver manager that the current parameters were set.
    parametersSet_ = true;

    if (verbosity_ & Debug) {
      using std::endl;

      std::ostream& dbg = printer_->stream (Debug);
      // Dereference params_ so the parameter list itself is printed,
      // rather than the RCP handle that points to it.
      dbg << "MINRES parameters:" << endl << *params_ << endl;
  // Test correct quoting of labels for TimeMonitor's YAML output.
  //
  // The test creates one timer per entry of inputLabels, runs every
  // timer a few times, asks TimeMonitor for a YAML report in both the
  // "compact" and the "spacious" style, and checks that each entry of
  // outputLabels occurs somewhere in the report text.
  TEUCHOS_UNIT_TEST( TimeMonitor, YamlLabelQuoting )
    using Teuchos::Array;
    using Teuchos::ParameterList;
    using Teuchos::parameterList;
    using Teuchos::RCP;
    using Teuchos::Time;
    typedef Array<std::string>::size_type size_type;

    // inputLabels holds the raw timer names; outputLabels holds the
    // text expected in the YAML report for the label at the same index.
    Array<std::string> inputLabels, outputLabels;

    // Make sure to exercise things that don't need quoting, like
    // spaces and certain punctuation, as well as things that do need
    // quoting, like colons, inner double quotes, and backslashes.
    inputLabels.push_back ("NoQuotingNeeded");
    inputLabels.push_back ("No quoting needed");
    inputLabels.push_back ("\"AlreadyQuotedNoQuotingNeeded\"");
    inputLabels.push_back ("\"Already quoted, no quoting needed\"");
    inputLabels.push_back ("\"Already quoted: quoting needed\"");
    inputLabels.push_back ("NotQuoted:QuotingNeeded");
    inputLabels.push_back ("Not quoted: quoting needed");
    // Test both individual double quotes, and pairs of double quotes.
    inputLabels.push_back ("Not quoted \" quoting needed");
    inputLabels.push_back ("Not quoted \" \" quoting needed");
    inputLabels.push_back ("\"Already quoted \" quoting needed\"");
    inputLabels.push_back ("\"Already quoted \" \" quoting needed\"");
    // Remember that in C strings, a double backslash turns into a
    // single backslash.  Our YAML output routine should turn each
    // single backslash back into a double backslash.
    inputLabels.push_back ("Not quoted \\ quoting needed");
    inputLabels.push_back ("Not quoted \\\\ quoting needed");
    inputLabels.push_back ("Not quoted \\ \\ quoting needed");
    inputLabels.push_back ("\"Already quoted \\ quoting needed\"");
    inputLabels.push_back ("\"Already quoted \\\\ quoting needed\"");
    inputLabels.push_back ("\"Already quoted \\ \\ quoting needed\"");

    // Expected output, in the same order as the inputs above.  Labels
    // that were not already wrapped in double quotes and that contain a
    // colon, a double quote, or a backslash are expected to come back
    // wrapped in double quotes, with inner quotes and backslashes
    // escaped by a backslash.
    outputLabels.push_back ("NoQuotingNeeded");
    outputLabels.push_back ("No quoting needed");
    outputLabels.push_back ("\"AlreadyQuotedNoQuotingNeeded\"");
    outputLabels.push_back ("\"Already quoted, no quoting needed\"");
    outputLabels.push_back ("\"Already quoted: quoting needed\"");
    outputLabels.push_back ("\"NotQuoted:QuotingNeeded\"");
    outputLabels.push_back ("\"Not quoted: quoting needed\"");
    outputLabels.push_back ("\"Not quoted \\\" quoting needed\"");
    outputLabels.push_back ("\"Not quoted \\\" \\\" quoting needed\"");
    outputLabels.push_back ("\"Already quoted \\\" quoting needed\"");
    outputLabels.push_back ("\"Already quoted \\\" \\\" quoting needed\"");
    outputLabels.push_back ("\"Not quoted \\\\ quoting needed\"");
    outputLabels.push_back ("\"Not quoted \\\\\\\\ quoting needed\"");
    outputLabels.push_back ("\"Not quoted \\\\ \\\\ quoting needed\"");
    outputLabels.push_back ("\"Already quoted \\\\ quoting needed\"");
    outputLabels.push_back ("\"Already quoted \\\\\\\\ quoting needed\"");
    outputLabels.push_back ("\"Already quoted \\\\ \\\\ quoting needed\"");

    // Sanity check.
    // The two arrays must have the same length, since the checks below
    // index outputLabels with indices taken from inputLabels.
      inputLabels.size () != outputLabels.size (),
      "The number of input labels is different than the number of output labels."
      "  Please ask a Teuchos developer to make sure that every test input "
      "label has a corresponding output label.");

    // Create one timer per input label.
    Array<RCP<Time> > timers;
    for (size_type i = 0; i < inputLabels.size (); ++i) {
      timers.push_back (TimeMonitor::getNewCounter (inputLabels[i]));

    // The actual number of operations in the loop is proportional to
    // the cube of the loop length.  Adjust the quantities below as
    // necessary to ensure the timer reports a nonzero elapsed time
    // for each of the invocations.
    const size_t loopLength = 25;
    // Run every timer three times; each TimeMonitor object times the
    // slowLoop call made while it is in scope.
    for (int k = 0; k < 3; ++k) {
      for (size_type i = 0; i < timers.size (); ++i) {
        TimeMonitor timeMon (* timers[i]);
        slowLoop (loopLength);

    { // YAML output, compact style.
      std::ostringstream oss;
      // Start from a copy of the valid report parameters, then override
      // the format and style.
      RCP<ParameterList> reportParams =
        parameterList (* (TimeMonitor::getValidReportParameters ()));
      reportParams->set ("Report format", "YAML");
      reportParams->set ("YAML style", "compact");
      TimeMonitor::report (oss, reportParams);

      // Echo output to the FancyOStream out (which is a standard unit
      // test argument).  Output should only appear in "show all test
      // details" mode.
      out << oss.str () << std::endl;

      // Make sure that all timer labels appear correctly in the output.
      // A result of npos from find() means the expected label is absent.
      for (size_type i = 0; i < inputLabels.size(); ++i) {
        const size_t pos = oss.str ().find (outputLabels[i]);
        TEST_INEQUALITY(pos, std::string::npos);

    { // YAML output, spacious style.
      std::ostringstream oss;
      // Same setup as the compact case; only the "YAML style" differs.
      RCP<ParameterList> reportParams =
        parameterList (* (TimeMonitor::getValidReportParameters ()));
      reportParams->set ("Report format", "YAML");
      reportParams->set ("YAML style", "spacious");
      TimeMonitor::report (oss, reportParams);

      // Echo output to the FancyOStream out (which is a standard unit
      // test argument).  Output should only appear in "show all test
      // details" mode.
      out << oss.str () << std::endl;

      // Make sure that all timer labels appear correctly in the output.
      // A result of npos from find() means the expected label is absent.
      for (size_type i = 0; i < inputLabels.size(); ++i) {
        const size_t pos = oss.str ().find (outputLabels[i]);
        TEST_INEQUALITY(pos, std::string::npos);

    // This sets up for the next unit test.
    TimeMonitor::clearCounters ();
// Configure this solver from a parameter list.
//
// Two parameter-list layouts are accepted:
//   - a "Rythmos" sublist of appParams (legacy layout), where the
//     stepper, step control strategy and integrator are built by hand
//     from entries of that sublist; or
//   - a "Rythmos Solver" sublist of appParams (new layout), which
//     contains nested "Rythmos" and "Stratimikos" sublists and builds
//     the integrator through Rythmos::IntegratorBuilder.
// If neither sublist is present, the final branch reports an
// InvalidParameter error.
//
// \param appParams  Top-level parameter list; must contain either a
//   "Rythmos" or a "Rythmos Solver" sublist.
// \param in_model   Model to integrate; stored in the member `model`
//   and used to set num_p and num_g.
// \param observer   Optional integration observer; only tested for
//   non-null here (the code that attaches it is not visible in this
//   listing).
//
// On completion the member isInitialized is set to true.
void Piro::RythmosSolver<Scalar>::initialize(
    const Teuchos::RCP<Teuchos::ParameterList> &appParams,
    const Teuchos::RCP< Thyra::ModelEvaluator<Scalar> > &in_model,
    const Teuchos::RCP<Rythmos::IntegrationObserverBase<Scalar> > &observer)

    using Teuchos::ParameterList;
    using Teuchos::parameterList;
    using Teuchos::RCP;
    using Teuchos::rcp;

    // set some internals
    model = in_model;
    num_p = in_model->Np();
    num_g = in_model->Ng();

    *out << "\nA) Get the base parameter list ...\n";

    // Legacy layout: every setting is read directly from the "Rythmos"
    // sublist.
    if (appParams->isSublist("Rythmos")) {
        RCP<Teuchos::ParameterList> rythmosPL = sublist(appParams, "Rythmos", true);

            // Translate the "Verbosity Level" string into the matching
            // Teuchos verbosity enum; any other string is an error.
            const std::string verbosity = rythmosPL->get("Verbosity Level", "VERB_DEFAULT");
            if      (verbosity == "VERB_NONE")    solnVerbLevel = Teuchos::VERB_NONE;
            else if (verbosity == "VERB_DEFAULT") solnVerbLevel = Teuchos::VERB_DEFAULT;
            else if (verbosity == "VERB_LOW")     solnVerbLevel = Teuchos::VERB_LOW;
            else if (verbosity == "VERB_MEDIUM")  solnVerbLevel = Teuchos::VERB_MEDIUM;
            else if (verbosity == "VERB_HIGH")    solnVerbLevel = Teuchos::VERB_HIGH;
            else if (verbosity == "VERB_EXTREME") solnVerbLevel = Teuchos::VERB_EXTREME;
            else TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,"Unknown verbosity option specified in Piro_RythmosSolver.");

        // Integration interval; defaults are [0.0, 0.1].
        t_initial = rythmosPL->get("Initial Time", 0.0);
        t_final = rythmosPL->get("Final Time", 0.1);

        const std::string stepperType = rythmosPL->get("Stepper Type", "Backward Euler");

        *out << "\nC) Create and initalize the forward model ...\n";

        *out << "\nD) Create the stepper and integrator for the forward problem ...\n";

        // Choose the nonlinear solver used inside each time step.  Note
        // that "Nonlinear Solver Type" is read without a default value,
        // unlike the lookups above.
        if (rythmosPL->get<std::string>("Nonlinear Solver Type") == "Rythmos") {
            Teuchos::RCP<Rythmos::TimeStepNonlinearSolver<Scalar> > rythmosTimeStepSolver =
            if (rythmosPL->getEntryPtr("NonLinear Solver")) {
                RCP<Teuchos::ParameterList> nonlinePL =
                    sublist(rythmosPL, "NonLinear Solver", true);
            fwdTimeStepSolver = rythmosTimeStepSolver;
        else if (rythmosPL->get<std::string>("Nonlinear Solver Type") == "NOX") {
            // NOX parameters are copied from the top-level "NOX" sublist
            // of appParams, not from the "Rythmos" sublist.
            Teuchos::RCP<Thyra::NOXNonlinearSolver> nox_solver =  Teuchos::rcp(new Thyra::NOXNonlinearSolver);
            Teuchos::RCP<Teuchos::ParameterList> nox_params = Teuchos::rcp(new Teuchos::ParameterList);
            *nox_params = appParams->sublist("NOX");
            fwdTimeStepSolver = nox_solver;
            TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,"Requested NOX solver for a Rythmos Transient solve, Trilinos was not built with NOX enabled.  Please rebuild Trilinos or use the native Rythmos nonlinear solver.");


        // Build the stepper named by "Stepper Type".  All built-in
        // steppers take their settings from the "Rythmos Stepper" sublist.
        if (stepperType == "Backward Euler") {
            fwdStateStepper = Rythmos::backwardEulerStepper<Scalar> (model, fwdTimeStepSolver);
            fwdStateStepper->setParameterList(sublist(rythmosPL, "Rythmos Stepper", true));
        else if (stepperType == "Forward Euler") {
            fwdStateStepper = Rythmos::forwardEulerStepper<Scalar> (model);
            fwdStateStepper->setParameterList(sublist(rythmosPL, "Rythmos Stepper", true));
        else if (stepperType == "Explicit RK") {
            fwdStateStepper = Rythmos::explicitRKStepper<Scalar>(model);
            fwdStateStepper->setParameterList(sublist(rythmosPL, "Rythmos Stepper", true));
        else if (stepperType == "BDF") {
            Teuchos::RCP<Teuchos::ParameterList> BDFparams =
                Teuchos::sublist(rythmosPL, "Rythmos Stepper", true);
            Teuchos::RCP<Teuchos::ParameterList> BDFStepControlPL =
                Teuchos::sublist(BDFparams,"Step Control Settings");

            fwdStateStepper = Teuchos::rcp( new Rythmos::ImplicitBDFStepper<Scalar>(model,fwdTimeStepSolver,BDFparams) );

        else {
            // first (before failing) check to see if the user has added stepper factory
            typename std::map<std::string,Teuchos::RCP<Piro::RythmosStepperFactory<Scalar> > >::const_iterator
            stepFactItr = stepperFactories.find(stepperType);
            if(stepFactItr!=stepperFactories.end()) {
                // the user has added it, hot dog lets build a new stepper!
                Teuchos::RCP<Teuchos::ParameterList> stepperParams = Teuchos::sublist(rythmosPL, "Rythmos Stepper", true);

                // build the stepper using the factory
                fwdStateStepper = stepFactItr->second->buildStepper(model,fwdTimeStepSolver,stepperParams);

                // the user decided to override the model being used (let them)
                if(fwdStateStepper->getModel()!=model && fwdStateStepper->getModel()!=Teuchos::null) {
                    model = Teuchos::rcp_const_cast<Thyra::ModelEvaluator<Scalar> >(fwdStateStepper->getModel());

                    // NOTE(review): these counts are re-read from in_model,
                    // not from the model that was just substituted above.
                    // Confirm whether `model` was intended here.
                    num_p = in_model->Np();
                    num_g = in_model->Ng();
            else {
                    true, Teuchos::Exceptions::InvalidParameter,
                    std::endl << "Error! Piro::RythmosSolver: Invalid Steper Type: "
                    << stepperType << std::endl);

        // Step control strategy
            // If the stepper can accept a step control strategy, then attempt to build one.
            RCP<Rythmos::StepControlStrategyAcceptingStepperBase<Scalar> > scsa_stepper =
                Teuchos::rcp_dynamic_cast<Rythmos::StepControlStrategyAcceptingStepperBase<Scalar> >(fwdStateStepper);

            // A null result from the cast means the stepper does not take a
            // step control strategy, and this section is skipped.
            if (Teuchos::nonnull(scsa_stepper)) {
                const std::string step_control_strategy = rythmosPL->get("Step Control Strategy Type", "None");

                if (step_control_strategy == "None") {
                    // don't do anything, stepper will build default
                } else if (step_control_strategy == "ImplicitBDFRamping") {

                    const RCP<Rythmos::ImplicitBDFStepperRampingStepControl<Scalar> > rscs =
                        rcp(new Rythmos::ImplicitBDFStepperRampingStepControl<Scalar>);

                    // parameterList(...) makes a copy of the sublist here.
                    const RCP<ParameterList> p = parameterList(rythmosPL->sublist("Rythmos Step Control Strategy"));

                else {
                    // first (before failing) check to see if the user has added step control factory
                    typename std::map<std::string,Teuchos::RCP<Piro::RythmosStepControlFactory<Scalar> > >::const_iterator
                    stepControlFactItr = stepControlFactories.find(step_control_strategy);
                    if (stepControlFactItr != stepControlFactories.end())

                        const RCP<Rythmos::StepControlStrategyBase<Scalar> > rscs = stepControlFactItr->second->buildStepControl();

                        const RCP<ParameterList> p = parameterList(rythmosPL -> sublist("Rythmos Step Control Strategy"));


                    else {
                            true, std::logic_error,
                            "Error! Piro::RythmosSolver: Invalid step control strategy type: "
                            << step_control_strategy << std::endl);
            const RCP<Teuchos::ParameterList> integrationControlPL =
                Teuchos::sublist(rythmosPL, "Rythmos Integration Control", true);

            // Build the integrator with either the "Simple" (default) or
            // the "Ramping" integration control strategy.
            // NOTE(review): in the lines visible here, any other value
            // leaves defaultIntegrator unset before it is assigned to
            // fwdStateIntegrator -- confirm this is handled.
            RCP<Rythmos::DefaultIntegrator<Scalar> > defaultIntegrator;
            if (rythmosPL->get("Rythmos Integration Control Strategy", "Simple") == "Simple") {
                defaultIntegrator = Rythmos::controlledDefaultIntegrator<Scalar>(Rythmos::simpleIntegrationControlStrategy<Scalar>(integrationControlPL));
            else if(rythmosPL->get<std::string>("Rythmos Integration Control Strategy") == "Ramping") {
                defaultIntegrator = Rythmos::controlledDefaultIntegrator<Scalar>(Rythmos::rampingIntegrationControlStrategy<Scalar>(integrationControlPL));
            fwdStateIntegrator = defaultIntegrator;

        fwdStateIntegrator->setParameterList(sublist(rythmosPL, "Rythmos Integrator", true));

        if (Teuchos::nonnull(observer)) {

    else if (appParams->isSublist("Rythmos Solver")) {
        /** New parameter list format **/
        // Solver-level settings live in "Rythmos Solver"; the nested
        // "Rythmos" sublist holds the integrator and stepper settings.
        RCP<Teuchos::ParameterList> rythmosSolverPL = sublist(appParams, "Rythmos Solver", true);
        RCP<Teuchos::ParameterList> rythmosPL = sublist(rythmosSolverPL, "Rythmos", true);

            // Same string-to-enum mapping as in the legacy branch, but the
            // string is read from the "Rythmos Solver" sublist.
            const std::string verbosity = rythmosSolverPL->get("Verbosity Level", "VERB_DEFAULT");
            if      (verbosity == "VERB_NONE")    solnVerbLevel = Teuchos::VERB_NONE;
            else if (verbosity == "VERB_DEFAULT") solnVerbLevel = Teuchos::VERB_DEFAULT;
            else if (verbosity == "VERB_LOW")     solnVerbLevel = Teuchos::VERB_LOW;
            else if (verbosity == "VERB_MEDIUM")  solnVerbLevel = Teuchos::VERB_MEDIUM;
            else if (verbosity == "VERB_HIGH")    solnVerbLevel = Teuchos::VERB_HIGH;
            else if (verbosity == "VERB_EXTREME") solnVerbLevel = Teuchos::VERB_EXTREME;
            else TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
                                                "Unknown verbosity option specified in Piro_RythmosSolver.");

        // Integration interval; defaults are [0.0, 0.1].
        t_initial = rythmosPL->sublist("Integrator Settings").get("Initial Time", 0.0);
        t_final = rythmosPL->sublist("Integrator Settings").get("Final Time", 0.1);

        const std::string stepperType = rythmosPL->sublist("Stepper Settings")
                                        .sublist("Stepper Selection").get("Stepper Type", "Backward Euler");
        //    *out << "\nB) Create the Stratimikos linear solver factory ...\n";
        // This is the linear solve strategy that will be used to solve for the
        // linear system with the W.
        Stratimikos::DefaultLinearSolverBuilder linearSolverBuilder;

        // Register the Ifpack2 preconditioner factory and MueLu with the
        // linear solver builder.
        typedef Thyra::PreconditionerFactoryBase<double> Base;
        typedef Thyra::Ifpack2PreconditionerFactory<Tpetra::CrsMatrix<double, LocalOrdinal, GlobalOrdinal, Node> > Impl;
        typedef Thyra::Ifpack2PreconditionerFactory<Tpetra::CrsMatrix<double> > Impl;
        linearSolverBuilder.setPreconditioningStrategyFactory(Teuchos::abstractFactoryStd<Base, Impl>(), "Ifpack2");
        Stratimikos::enableMueLu<LocalOrdinal, GlobalOrdinal, Node>(linearSolverBuilder);

        linearSolverBuilder.setParameterList(sublist(rythmosSolverPL, "Stratimikos", true));
        RCP<Thyra::LinearOpWithSolveFactoryBase<double> > lowsFactory =
        *out << "\nC) Create and initalize the forward model ...\n";
        // C.1) Create the underlying EpetraExt::ModelEvaluator
        // already constructed as "model". Decorate if needed.
        // TODO: Generalize to any explicit method, option to invert mass matrix
        if (stepperType == "Explicit RK") {
            if (rythmosSolverPL->get("Invert Mass Matrix", false)) {
                // Wrap the model in a decorator that inverts the mass
                // matrix; "Lump Mass Matrix" is passed through as a flag.
                Teuchos::RCP<Thyra::ModelEvaluator<Scalar> > origModel = model;
                rythmosSolverPL->get("Lump Mass Matrix", false);  //JF line does not do anything
                model = Teuchos::rcp(new Piro::InvertMassMatrixDecorator<Scalar>(
                                         sublist(rythmosSolverPL,"Stratimikos", true), origModel,
                                         true,rythmosSolverPL->get("Lump Mass Matrix", false),false));
        // C.2) Create the Thyra-wrapped ModelEvaluator

        thyraModel = rcp(new Thyra::DefaultModelEvaluatorWithSolveFactory<Scalar>(model, lowsFactory));

        const RCP<const Thyra::VectorSpaceBase<double> > x_space =

        *out << "\nD) Create the stepper and integrator for the forward problem ...\n";
        fwdTimeStepSolver = Rythmos::timeStepNonlinearSolver<double>();

        if (rythmosSolverPL->getEntryPtr("NonLinear Solver")) {
            const RCP<Teuchos::ParameterList> nonlinePL =
                sublist(rythmosSolverPL, "NonLinear Solver", true);
        // Force Default Integrator since this is needed for Observers
        rythmosPL->sublist("Integrator Settings").sublist("Integrator Selection").
        set("Integrator Type","Default Integrator");

        // Build the integrator from the model's nominal values, then
        // down-cast it to DefaultIntegrator.  The `true` argument asks the
        // cast to throw if the integrator has a different type.
        RCP<Rythmos::IntegratorBuilder<double> > ib = Rythmos::integratorBuilder<double>();
        Thyra::ModelEvaluatorBase::InArgs<double> ic = thyraModel->getNominalValues();
        RCP<Rythmos::IntegratorBase<double> > integrator = ib->create(thyraModel,ic,fwdTimeStepSolver);
        fwdStateIntegrator = Teuchos::rcp_dynamic_cast<Rythmos::DefaultIntegrator<double> >(integrator,true);

        // In this branch the stepper is taken from the integrator rather
        // than built directly.
        fwdStateStepper = fwdStateIntegrator->getNonconstStepper();

        if (Teuchos::nonnull(observer))

    else {
        // Neither recognized sublist is present: report the error.
            appParams->isSublist("Rythmos") || appParams->isSublist("Rythmos Solver"),
            Teuchos::Exceptions::InvalidParameter, std::endl <<
            "Error! Piro::RythmosSolver: must have either Rythmos or Rythmos Solver sublist ");


    isInitialized = true;
// Configure this solver from the "Rythmos" sublist of appParams.
//
// Reads the verbosity level, the time interval, the nonlinear solver
// type, the stepper type, an optional step control strategy and the
// integration control strategy, builds the corresponding Rythmos
// objects, and finally sets isInitialized to true.
//
// \param appParams  Top-level parameter list; its "Rythmos" sublist
//   holds all settings read here (the "NOX" sublist is also read when
//   the NOX nonlinear solver is requested).
// \param in_model   Model to integrate; stored in the member `model`
//   and used to set num_p and num_g.
// \param observer   Optional integration observer; only tested for
//   non-null here (the code that attaches it is not visible in this
//   listing).
void Piro::RythmosSolver<Scalar>::initialize(
    const Teuchos::RCP<Teuchos::ParameterList> &appParams,
    const Teuchos::RCP< Thyra::ModelEvaluator<Scalar> > &in_model,
    const Teuchos::RCP<Rythmos::IntegrationObserverBase<Scalar> > &observer)
  using Teuchos::ParameterList;
  using Teuchos::parameterList;
  using Teuchos::RCP;
  using Teuchos::rcp;

  // set some internals
  model = in_model;
  num_p = in_model->Np();
  num_g = in_model->Ng();

  *out << "\nA) Get the base parameter list ...\n";

  RCP<Teuchos::ParameterList> rythmosPL = sublist(appParams, "Rythmos", true);

    // Translate the "Verbosity Level" string into the matching Teuchos
    // verbosity enum; any other string is an error.
    const std::string verbosity = rythmosPL->get("Verbosity Level", "VERB_DEFAULT");
    if      (verbosity == "VERB_NONE")    solnVerbLevel = Teuchos::VERB_NONE;
    else if (verbosity == "VERB_DEFAULT") solnVerbLevel = Teuchos::VERB_DEFAULT;
    else if (verbosity == "VERB_LOW")     solnVerbLevel = Teuchos::VERB_LOW;
    else if (verbosity == "VERB_MEDIUM")  solnVerbLevel = Teuchos::VERB_MEDIUM;
    else if (verbosity == "VERB_HIGH")    solnVerbLevel = Teuchos::VERB_HIGH;
    else if (verbosity == "VERB_EXTREME") solnVerbLevel = Teuchos::VERB_EXTREME;
    else TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,"Unknown verbosity option specified in Piro_RythmosSolver.");

  // Integration interval; defaults are [0.0, 0.1].
  t_initial = rythmosPL->get("Initial Time", 0.0);
  t_final = rythmosPL->get("Final Time", 0.1);

  const std::string stepperType = rythmosPL->get("Stepper Type", "Backward Euler");

  *out << "\nC) Create and initalize the forward model ...\n";

  *out << "\nD) Create the stepper and integrator for the forward problem ...\n";

  // Choose the nonlinear solver used inside each time step.  Note that
  // "Nonlinear Solver Type" is read without a default value, unlike the
  // lookups above.
  if (rythmosPL->get<std::string>("Nonlinear Solver Type") == "Rythmos") {
    Teuchos::RCP<Rythmos::TimeStepNonlinearSolver<Scalar> > rythmosTimeStepSolver =
    if (rythmosPL->getEntryPtr("NonLinear Solver")) {
      RCP<Teuchos::ParameterList> nonlinePL =
	sublist(rythmosPL, "NonLinear Solver", true);
    fwdTimeStepSolver = rythmosTimeStepSolver;
  else if (rythmosPL->get<std::string>("Nonlinear Solver Type") == "NOX") {
#ifdef Piro_ENABLE_NOX
    // NOX parameters are copied from the top-level "NOX" sublist of
    // appParams, not from the "Rythmos" sublist.
    Teuchos::RCP<Thyra::NOXNonlinearSolver> nox_solver =  Teuchos::rcp(new Thyra::NOXNonlinearSolver);
    Teuchos::RCP<Teuchos::ParameterList> nox_params = Teuchos::rcp(new Teuchos::ParameterList);
    *nox_params = appParams->sublist("NOX");
    fwdTimeStepSolver = nox_solver;
    TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,"Requested NOX solver for a Rythmos Transient solve, Trilinos was not built with NOX enabled.  Please rebuild Trilinos or use the native Rythmos nonlinear solver.");


  // Build the stepper named by "Stepper Type".  All built-in steppers
  // take their settings from the "Rythmos Stepper" sublist.
  if (stepperType == "Backward Euler") {
    fwdStateStepper = Rythmos::backwardEulerStepper<Scalar> (model, fwdTimeStepSolver);
    fwdStateStepper->setParameterList(sublist(rythmosPL, "Rythmos Stepper", true));
  else if (stepperType == "Forward Euler") {
    fwdStateStepper = Rythmos::forwardEulerStepper<Scalar> (model);
    fwdStateStepper->setParameterList(sublist(rythmosPL, "Rythmos Stepper", true));
  else if (stepperType == "Explicit RK") {
    fwdStateStepper = Rythmos::explicitRKStepper<Scalar>(model);
    fwdStateStepper->setParameterList(sublist(rythmosPL, "Rythmos Stepper", true));
  else if (stepperType == "BDF") {
    Teuchos::RCP<Teuchos::ParameterList> BDFparams =
      Teuchos::sublist(rythmosPL, "Rythmos Stepper", true);
    Teuchos::RCP<Teuchos::ParameterList> BDFStepControlPL =
      Teuchos::sublist(BDFparams,"Step Control Settings");

    fwdStateStepper = Teuchos::rcp( new Rythmos::ImplicitBDFStepper<Scalar>(model,fwdTimeStepSolver,BDFparams) );

  else {
    // first (before failing) check to see if the user has added stepper factory
    typename std::map<std::string,Teuchos::RCP<RythmosStepperFactory<Scalar> > >::const_iterator
        stepFactItr = stepperFactories.find(stepperType);
    if(stepFactItr!=stepperFactories.end()) {
      // the user has added it, hot dog lets build a new stepper!
      Teuchos::RCP<Teuchos::ParameterList> stepperParams = Teuchos::sublist(rythmosPL, "Rythmos Stepper", true);

      // build the stepper using the factory
      fwdStateStepper = stepFactItr->second->buildStepper(model,fwdTimeStepSolver,stepperParams);
    else {
          true, Teuchos::Exceptions::InvalidParameter,
          std::endl << "Error! Piro::Epetra::RythmosSolver: Invalid Steper Type: "
          << stepperType << std::endl);

  // Step control strategy
    // If the stepper can accept a step control strategy, then attempt to build one.
    RCP<Rythmos::StepControlStrategyAcceptingStepperBase<Scalar> > scsa_stepper =
      Teuchos::rcp_dynamic_cast<Rythmos::StepControlStrategyAcceptingStepperBase<Scalar> >(fwdStateStepper);

    // A null result from the cast means the stepper does not take a step
    // control strategy, and this section is skipped.
    if (Teuchos::nonnull(scsa_stepper)) {
      const std::string step_control_strategy = rythmosPL->get("Step Control Strategy Type", "None");

      if (step_control_strategy == "None") {
        // don't do anything, stepper will build default
      } else if (step_control_strategy == "ImplicitBDFRamping") {

        const RCP<Rythmos::ImplicitBDFStepperRampingStepControl<Scalar> > rscs =
          rcp(new Rythmos::ImplicitBDFStepperRampingStepControl<Scalar>);

        // parameterList(...) makes a copy of the sublist here.
        const RCP<ParameterList> p = parameterList(rythmosPL->sublist("Rythmos Step Control Strategy"));

      } else {
            true, std::logic_error,
            "Error! Piro::Epetra::RythmosSolver: Invalid step control strategy type: "
            << step_control_strategy << std::endl);

    const RCP<Teuchos::ParameterList> integrationControlPL =
      Teuchos::sublist(rythmosPL, "Rythmos Integration Control", true);

    // Build the integrator with either the "Simple" (default) or the
    // "Ramping" integration control strategy.
    // NOTE(review): in the lines visible here, any other value leaves
    // defaultIntegrator unset before it is assigned to
    // fwdStateIntegrator -- confirm this is handled.
    RCP<Rythmos::DefaultIntegrator<Scalar> > defaultIntegrator;
    if (rythmosPL->get("Rythmos Integration Control Strategy", "Simple") == "Simple") {
      defaultIntegrator = Rythmos::controlledDefaultIntegrator<Scalar>(Rythmos::simpleIntegrationControlStrategy<Scalar>(integrationControlPL));
    else if(rythmosPL->get<std::string>("Rythmos Integration Control Strategy") == "Ramping") {
      defaultIntegrator = Rythmos::controlledDefaultIntegrator<Scalar>(Rythmos::rampingIntegrationControlStrategy<Scalar>(integrationControlPL));
    fwdStateIntegrator = defaultIntegrator;

  fwdStateIntegrator->setParameterList(sublist(rythmosPL, "Rythmos Integrator", true));

  if (Teuchos::nonnull(observer)) {

  isInitialized = true;
// Test driver: instantiate Belos::BlockGCRODRSolMgr for Tpetra objects,
// give it an empty parameter list, build a nonsymmetric test problem,
// and run one solve.
//
// Command-line options: --numRows, --tolerant/--intolerant,
// --verbose/--quiet, --debug/--release.
//
// Returns EXIT_FAILURE if command-line parsing fails; otherwise the
// exit code is chosen from `success` (see the note at its declaration).
main (int argc, char *argv[]) 
  using Teuchos::Comm;
  using Teuchos::FancyOStream;
  using Teuchos::getFancyOStream;
  using Teuchos::oblackholestream;
  using Teuchos::OSTab;
  using Teuchos::ParameterList;
  using Teuchos::parameterList;
  using Teuchos::RCP;
  using Teuchos::rcpFromRef;
  using std::cout;
  using std::endl;
  // Typedefs for Tpetra template arguments.
  typedef double scalar_type;
  typedef long int global_ordinal_type;
  typedef int local_ordinal_type;
  typedef Kokkos::DefaultNode::DefaultNodeType node_type;
  // Tpetra objects which are the MV and OP template parameters of the
  // Belos specialization which we are testing.
  typedef Tpetra::MultiVector<scalar_type, local_ordinal_type, global_ordinal_type, node_type> MV;
  typedef Tpetra::Operator<scalar_type, local_ordinal_type, global_ordinal_type, node_type> OP;
  // Other typedefs.
  typedef Teuchos::ScalarTraits<scalar_type> STS;
  typedef Tpetra::CrsMatrix<scalar_type, local_ordinal_type, global_ordinal_type, node_type> sparse_matrix_type;

  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &cout);
  RCP<const Comm<int> > comm = Tpetra::DefaultPlatform::getDefaultPlatform().getComm();
  RCP<node_type> node = Tpetra::DefaultPlatform::getDefaultPlatform().getNode();
  // Sink stream used wherever output should be discarded.
  RCP<oblackholestream> blackHole (new oblackholestream);
  const int myRank = comm->getRank();

  // Output stream that prints only on Rank 0.
  RCP<FancyOStream> out;
  if (myRank == 0) {
    out = Teuchos::getFancyOStream (rcpFromRef (cout));
  } else {
    out = Teuchos::getFancyOStream (blackHole);

  // Get test parameters from command-line processor.
  // CommandLineProcessor always understands int, but may not
  // understand global_ordinal_type.  We convert to the latter below.
  // The default problem size is 100 rows per process.
  int numRows = comm->getSize() * 100;
  bool tolerant = false;
  bool verbose = false;
  bool debug = false;
  Teuchos::CommandLineProcessor cmdp (false, true);
  cmdp.setOption("numRows", &numRows,
		 "Global number of rows (and columns) in the sparse matrix to generate.");
  cmdp.setOption("tolerant", "intolerant", &tolerant,
		 "Whether to parse files tolerantly.");
  cmdp.setOption("verbose", "quiet", &verbose, 
		 "Print messages and results.");
  cmdp.setOption("debug", "release", &debug, 
		 "Run debugging checks and print copious debugging output.");
  if (cmdp.parse(argc,argv) != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL) {
    *out << "\nEnd Result: TEST FAILED" << endl;
    return EXIT_FAILURE;
  // Output stream for verbose output.
  RCP<FancyOStream> verbOut = verbose ? out : getFancyOStream (blackHole);

  // NOTE(review): `success` is const and always true, so the final
  // branch below always reports TEST PASSED; the result of solve() is
  // only printed.  Presumably deliberate for a "does it compile and
  // run" test -- confirm before tightening.
  const bool success = true;

  // Test whether it's possible to instantiate the solver.
  // This is a minimal compilation test.
  *verbOut << "Instantiating Block GCRODR solver" << endl;
  Belos::BlockGCRODRSolMgr<scalar_type, MV, OP> solver;
  // Test setting solver parameters.  For now, we just use an empty
  // (but non-null) parameter list, which the solver should fill in
  // with defaults.
  *verbOut << "Setting solver parameters" << endl;
  RCP<ParameterList> solverParams = parameterList ();
  solver.setParameters (solverParams);
  // Create a linear system to solve.
  *verbOut << "Creating linear system" << endl;
  RCP<sparse_matrix_type> A;
  RCP<MV> X_guess, X_exact, B;
    // The factory fills in A, X_guess, X_exact and B from the problem
    // parameters set below.
    typedef Belos::Tpetra::ProblemMaker<sparse_matrix_type> factory_type;
    factory_type factory (comm, node, out, tolerant, debug);
    RCP<ParameterList> problemParams = parameterList ();
    problemParams->set ("Global number of rows", 
			static_cast<global_ordinal_type> (numRows));
    problemParams->set ("Problem type", std::string ("Nonsymmetric"));
    factory.makeProblem (A, X_guess, X_exact, B, problemParams);
  // Approximate solution vector is a copy of the guess vector.
  RCP<MV> X (new MV (*X_guess));

  // Check that every piece of the linear system was created.
  TEUCHOS_TEST_FOR_EXCEPTION(A.is_null(), std::logic_error,
			     "The sparse matrix is null!");
  TEUCHOS_TEST_FOR_EXCEPTION(X_guess.is_null(), std::logic_error,
			     "The initial guess X_guess is null!");
  TEUCHOS_TEST_FOR_EXCEPTION(X_exact.is_null(), std::logic_error,
			     "The exact solution X_exact is null!");
  TEUCHOS_TEST_FOR_EXCEPTION(B.is_null(), std::logic_error,
			     "The right-hand side B is null!");
  TEUCHOS_TEST_FOR_EXCEPTION(X.is_null(), std::logic_error,
			     "The approximate solution vector X is null!");

  // Wrap A, X and B in a Belos linear problem and hand it to the solver.
  typedef Belos::LinearProblem<scalar_type, MV, OP> problem_type;
  RCP<problem_type> problem (new problem_type (A, X, B));
  problem->setProblem ();
  solver.setProblem (problem);

  *verbOut << "Solving linear system" << endl;
  Belos::ReturnType result = solver.solve ();

  *verbOut << "Result of solve: " 
	   << Belos::convertReturnTypeToString (result) 
	   << endl;
  if (success) {
    *out << "\nEnd Result: TEST PASSED" << endl;
    return EXIT_SUCCESS;
  else {
    *out << "\nEnd Result: TEST FAILED" << endl;
    return EXIT_FAILURE;
// Test driver for the MultiVectorFiller test.
//
// Command-line options: --unknownsPerNode, --unknownsPerElt, --numCols,
// --verbose/--quiet.
//
// Each process runs the test and records whether it threw an
// exception.  The per-process results are then combined so that the
// test is reported as passed only if it succeeded on every process.
//
// Returns EXIT_SUCCESS if all processes succeeded, EXIT_FAILURE if
// command-line parsing failed or any process caught an exception.
main (int argc, char *argv[]) 
  using Teuchos::as;
  using Teuchos::Comm;
  using Teuchos::FancyOStream;
  using Teuchos::getFancyOStream;
  using Teuchos::ParameterList;
  using Teuchos::parameterList;
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::rcpFromRef;
  using std::cerr;
  using std::cout;
  using std::endl;

  typedef double scalar_type;
  typedef int local_ordinal_type;
  typedef long global_ordinal_type;
  typedef int  global_ordinal_type;
  typedef Kokkos::SerialNode node_type;

  // MPI start-up output is sent to a stream that discards it.
  Teuchos::oblackholestream blackHole;
  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &blackHole);
  RCP<const Comm<int> > comm = Teuchos::DefaultComm<int>::getComm();

  // Read in command line arguments.
  int unknownsPerNode = 20; // number of unknowns per process
  int unknownsPerElt = 3; // number of unknowns per (overlapping) element
  int numCols = 1;
  bool verbose = false;

  Teuchos::CommandLineProcessor cmdp (false, true);
  cmdp.setOption ("unknownsPerNode", &unknownsPerNode, 
		  "Number of unknowns per process");
  cmdp.setOption ("unknownsPerElt", &unknownsPerElt, 
		  "Number of unknowns per (overlapping) element.");
  cmdp.setOption ("numCols", &numCols, 
		  "Number of columns in the multivector.  Must be positive.");
  cmdp.setOption ("verbose", "quiet", &verbose, 
		  "Whether to print verbose output.");
  if (cmdp.parse(argc,argv) != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL) {
    return EXIT_FAILURE;

  // Verbose mode prints to std::cout at the highest verbosity level;
  // otherwise output is discarded.
  const Teuchos::EVerbosityLevel verbLevel = 
    verbose ? Teuchos::VERB_EXTREME : Teuchos::VERB_DEFAULT;
  RCP<FancyOStream> out = verbose ? getFancyOStream (rcpFromRef (std::cout)) :
    getFancyOStream (rcpFromRef (blackHole));

  RCP<ParameterList> nodeParams = parameterList ("Kokkos Node");
  RCP<node_type> node = makeSerialNode (nodeParams);

  // Run the test.
  // An exception on this process is caught and recorded as a local
  // failure, so that the reduction below still runs.
  bool succeeded = true;
  try {
      node_type> (comm, node, as<size_t> (unknownsPerNode),
		  as<global_ordinal_type> (unknownsPerElt), 
		  as<size_t> (numCols), out, verbLevel);
    succeeded = true;
  } catch (std::exception& e) {
    *out << "MultiVectorFiller test threw an exception:  " << e.what() << endl;
    succeeded = false;

  // Combine the per-process results with a minimum rather than a sum:
  // the minimum of the 0/1 flags is 1 only if every process succeeded,
  // whereas a sum would be nonzero as soon as any one process
  // succeeded and would hide failures on the others.
  const int localSuccess = succeeded ? 1 : 0;
  int globalSuccess = localSuccess;
  Teuchos::reduceAll (*comm, Teuchos::REDUCE_MIN, localSuccess, 
		      Teuchos::ptr (&globalSuccess));
  if (globalSuccess) {
    std::cout << "End Result: TEST PASSED" << endl;
    return EXIT_SUCCESS;
  else {
    std::cout << "End Result: TEST FAILED" << endl;
    return EXIT_FAILURE;
  // Test filtering of timer labels.
  //
  // Creates one timer per entry of `labels`, runs each timer a few
  // times, and then calls TimeMonitor::report() once per filter and per
  // output format (default, YAML compact, YAML spacious).  For each
  // report, the test checks that the labels listed in outLabels[i] for
  // the current filter appear in the output, and that the labels for the
  // other filters and the labels in otherLabels do not.
  TEUCHOS_UNIT_TEST( TimeMonitor, TimerLabelFiltering )
    using Teuchos::Array;
    using Teuchos::ParameterList;
    using Teuchos::parameterList;
    using Teuchos::RCP;
    using Teuchos::Time;
    typedef Array<std::string>::size_type size_type;

    // Filters to use in the test.
    Array<std::string> filters;
    filters.push_back ("Foo:");
    filters.push_back ("Bar:");
    filters.push_back ("Baz:");

    // All the timer labels.
    Array<std::string> labels;
    labels.push_back ("Foo: timer 1");
    labels.push_back ("Foo: timer 2");
    labels.push_back ("Foo: timer 3");
    labels.push_back ("Bar: timer 1");
    labels.push_back ("Bar: timer 2");
    labels.push_back ("Baz: timer 1");
    labels.push_back ("Xyzzy");
    labels.push_back ("This is not a pipe");
    labels.push_back ("You should not see this");

    // outLabels[i] holds the labels expected in the report for
    // filters[i]; the indices of the two arrays correspond.
    Array<Array<std::string> > outLabels (3);
    // Label(s) that should be printed for filters[0]
    outLabels[0].push_back ("Foo: timer 1");
    outLabels[0].push_back ("Foo: timer 2");
    outLabels[0].push_back ("Foo: timer 3");
    // Label(s) that should be printed for filters[1]
    outLabels[1].push_back ("Bar: timer 1");
    outLabels[1].push_back ("Bar: timer 2");
    // Label(s) that should be printed for filters[2]
    outLabels[2].push_back ("Baz: timer 1");

    // Labels that should not be printed for any of the filters below.
    Array<std::string> otherLabels;
    otherLabels.push_back ("Xyzzy");
    otherLabels.push_back ("This is not a pipe");
    otherLabels.push_back ("You should not see this");

    // Create one timer for every label, in the same order as `labels`.
    Array<RCP<Time> > timers;
    for (size_type i = 0; i < labels.size (); ++i) {
      timers.push_back (TimeMonitor::getNewCounter (labels[i]));

    // The actual number of operations in the loop is proportional to
    // the cube of the loop length.  Adjust the quantities below as
    // necessary to ensure the timer reports a nonzero elapsed time
    // for each of the invocations.
    const size_t loopLength = 25;
    // Run every timer three times around a call to slowLoop().
    for (int k = 0; k < 3; ++k) {
      for (size_type i = 0; i < timers.size (); ++i) {
        TimeMonitor timeMon (* timers[i]);
        slowLoop (loopLength);

    try {
      // FIXME (mfh 21 Aug 2012) We don't yet have a test ensuring that
      // the filter only selects at the beginning of the timer label.

      // Test for each filter.
      for (size_type i = 0; i < filters.size (); ++i) {
        { // Default (tabular) output format.
          std::ostringstream oss;
          // Start from a copy of the valid report parameters, unmodified.
          RCP<ParameterList> reportParams =
            parameterList (* (TimeMonitor::getValidReportParameters ()));
          TimeMonitor::report (oss, filters[i], reportParams);

          // Echo output to the FancyOStream out (which is a standard unit
          // test argument).  Output should only appear in "show all test
          // details" mode.
          out << oss.str () << std::endl;

          // Check whether the labels that were supposed to be printed
          // were actually printed.
          for (size_type j = 0; j < outLabels[i].size(); ++j) {
            const size_t pos = oss.str ().find (outLabels[i][j]);
            TEST_INEQUALITY(pos, std::string::npos);

          // Check whether the labels that were _not_ supposed to be
          // printed were actually printed.
          // First, check the labels that should only be printed with
          // the other filters.
          for (size_type ii = 0; ii < outLabels.size(); ++ii) {
            if (ii != i) {
              for (size_type j = 0; j < outLabels[ii].size(); ++j) {
                const size_t pos = oss.str ().find (outLabels[ii][j]);
                TEST_EQUALITY(pos, std::string::npos);
          // Next, check the labels that should not be printed for any
          // filters.
          for (size_type j = 0; j < otherLabels.size(); ++j) {
            const size_t pos = oss.str ().find (otherLabels[j]);
            TEST_EQUALITY(pos, std::string::npos);

        { // YAML output, compact style.
          std::ostringstream oss;
          // Same checks as above, with the report parameters switched
          // to "YAML" format and "compact" style.
          RCP<ParameterList> reportParams =
            parameterList (* (TimeMonitor::getValidReportParameters ()));
          reportParams->set ("Report format", "YAML");
          reportParams->set ("YAML style", "compact");
          TimeMonitor::report (oss, filters[i], reportParams);

          // Echo output to the FancyOStream out (which is a standard unit
          // test argument).  Output should only appear in "show all test
          // details" mode.
          out << oss.str () << std::endl;

          // Check whether the labels that were supposed to be printed
          // were actually printed.
          for (size_type j = 0; j < outLabels[i].size(); ++j) {
            const size_t pos = oss.str ().find (outLabels[i][j]);
            TEST_INEQUALITY(pos, std::string::npos);

          // Check whether the labels that were _not_ supposed to be
          // printed were actually printed.
          // First, check the labels that should only be printed with
          // the other filters.
          for (size_type ii = 0; ii < outLabels.size(); ++ii) {
            if (ii != i) {
              for (size_type j = 0; j < outLabels[ii].size(); ++j) {
                const size_t pos = oss.str ().find (outLabels[ii][j]);
                TEST_EQUALITY(pos, std::string::npos);
          // Next, check the labels that should not be printed for any
          // filters.
          for (size_type j = 0; j < otherLabels.size(); ++j) {
            const size_t pos = oss.str ().find (otherLabels[j]);
            TEST_EQUALITY(pos, std::string::npos);

        { // YAML output, spacious style.
          std::ostringstream oss;
          // Same checks again, with "YAML" format and "spacious" style.
          RCP<ParameterList> reportParams =
            parameterList (* (TimeMonitor::getValidReportParameters ()));
          reportParams->set ("Report format", "YAML");
          reportParams->set ("YAML style", "spacious");
          TimeMonitor::report (oss, filters[i], reportParams);

          // Echo output to the FancyOStream out (which is a standard unit
          // test argument).  Output should only appear in "show all test
          // details" mode.
          out << oss.str () << std::endl;

          // Check whether the labels that were supposed to be printed
          // were actually printed.
          for (size_type j = 0; j < outLabels[i].size(); ++j) {
            const size_t pos = oss.str ().find (outLabels[i][j]);
            TEST_INEQUALITY(pos, std::string::npos);

          // Check whether the labels that were _not_ supposed to be
          // printed were actually printed.
          // First, check the labels that should only be printed with
          // the other filters.
          for (size_type ii = 0; ii < outLabels.size(); ++ii) {
            if (ii != i) {
              for (size_type j = 0; j < outLabels[ii].size(); ++j) {
                const size_t pos = oss.str ().find (outLabels[ii][j]);
                TEST_EQUALITY(pos, std::string::npos);
          // Next, check the labels that should not be printed for any
          // filters.
          for (size_type j = 0; j < otherLabels.size(); ++j) {
            const size_t pos = oss.str ().find (otherLabels[j]);
            TEST_EQUALITY(pos, std::string::npos);
    // NOTE(review): no rethrow is visible in this handler.  If there is
    // none, an exception thrown above is swallowed here after the
    // counters are cleared -- confirm that this is intended.
    catch (...) {
      // Make sure to clear the counters, so that they don't pollute
      // the remaining tests.  (The Teuchos unit test framework may
      // catch any exceptions that the above code throws, but allow
      // the remaining tests to continue.)
      TimeMonitor::clearCounters ();

    // This sets up for the next unit test.
    TimeMonitor::clearCounters ();
// Writes the valid parameter lists of four Belos solvers ("Block GMRES",
// "Pseudo Block GMRES", "Block CG", "Pseudo Block CG") to XML files in
// the current directory, one file per solver.  Each file optionally
// starts with an XSL header, controlled by xsl_header_flag.
//
// Returns the command-line parse result if parsing does not succeed;
// otherwise EXIT_SUCCESS if `success` was set, else EXIT_FAILURE.
int main(int argc, char* argv[])

  bool success = false;
  bool verbose = false;
    // Open an output file stream for writing our XML file
    std::ofstream out;
    // We will print to the 'out' ofstream, and that output will be
    // valid XML describing the validated ParameterList.  For the
    // purposes of generating nicely-formatted HTML documentation for
    // this ParameterList, we also need to include an XSL header line.
    // This bool will control whether we include this header line,
    // which can be controlled at the command line.
    bool xsl_header_flag = true;

    // Set up the command line processor.  All versions of this
    // executable should support the add-xsl-header /
    // suppress-xsl-header command line options.  If you want a single
    // executable to support multiple ParameterLists, you could put
    // additional options here to control which ParameterList to
    // output.
    // NOTE(review): the start of the option registration that ends with
    // "XSL header flag" below is not visible here.
    CommandLineProcessor clp(false);  //don't throw exceptions
                  "XSL header flag");

    // Parse the command line and quit if not successful
    CommandLineProcessor::EParseCommandLineReturn parse_return =
      clp.parse(argc, argv);
    if(parse_return != CommandLineProcessor::PARSE_SUCCESSFUL)
      return parse_return;

    // Here is where code should go that is required to generate the
    // validated XML file.  If your class uses a construct-then-init
    // idiom, then this is where the default constructor would be
    // called.  If this executable supports ParameterLists for more
    // than one class, then this is where the logic to pick the
    // appropriate class would go.

	// One loop iteration per solver: create the solver from an empty
	// parameter list, then write its valid parameters to its own file.
	// NOTE(review): `out` is reopened in every branch and no out.close()
	// is visible between the open() calls.  std::ofstream::open() fails
	// on a stream that is already open -- confirm that the stream is
	// closed at the end of each branch.
	Belos::SolverFactory<scalar_type, multivector_type, operator_type> sFactory;
	RCP<solver_type> solver;
	for ( int i = 0; i < 4; ++i )
		RCP<ParameterList> nullParams = parameterList();
		// The total in the progress message matches the loop bound (4).
		std::cout << "writing parameter list " << i+1 << " of 4." << std::endl;
		if ( i == 0 ) 
			solver = sFactory.create("Block GMRES", nullParams);
			out.open("belos_BlockGmres.xml", std::ofstream::out);
			if ( xsl_header_flag )
				writeXSLHeader( out );
			RCP<const ParameterList> gmresParams = solver -> getValidParameters();
			Teuchos::writeParameterListToXmlOStream( *gmresParams, out);
		else if ( i == 1 ) 	
			solver = sFactory.create("Pseudo Block GMRES", nullParams);
			out.open("belos_PseudoBlockGmres.xml", std::ofstream::out);
			if ( xsl_header_flag )
				writeXSLHeader( out );
			RCP<const ParameterList> pseudoGmresParams = solver -> getValidParameters();
			Teuchos::writeParameterListToXmlOStream( *pseudoGmresParams, out);
		else if ( i == 2 ) 
			solver = sFactory.create("Block CG", nullParams);
			out.open("belos_BlockCG.xml", std::ofstream::out);
			if ( xsl_header_flag )
				writeXSLHeader( out );
			RCP<const ParameterList> blockCgParams = solver -> getValidParameters();
			Teuchos::writeParameterListToXmlOStream( *blockCgParams, out);
		else if ( i == 3 ) 
			solver = sFactory.create("Pseudo Block CG", nullParams);
			out.open("belos_PseudoBlockCG.xml", std::ofstream::out);
			if ( xsl_header_flag )
				writeXSLHeader( out );
			RCP<const ParameterList> pseudoBlockCgParams = solver -> getValidParameters();
			Teuchos::writeParameterListToXmlOStream( *pseudoBlockCgParams, out);

    success = true;
  TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);

  return ( success ? EXIT_SUCCESS : EXIT_FAILURE );

// Test driver that solves one or more linear systems with the Belos
// "MINRES" solver, using Epetra objects.
//
// Steps, in order: parse command-line options; build the problem
// (A, X, B) from a file via Belos::Util::createEpetraProblem; compute
// and (in verbose mode) print norms of the initial residual, right-hand
// side and initial guess; create the solver through
// Belos::SolverFactory; solve; recompute the residuals explicitly and
// compare the relative residual norms against `tol`.
//
// Returns EXIT_SUCCESS if `success` ends up true, else EXIT_FAILURE.
// Also returns EXIT_FAILURE directly if command-line parsing or
// problem->setProblem() fails.
main (int argc, char *argv[])
  using Teuchos::inOutArg;
  using Teuchos::ParameterList;
  using Teuchos::parameterList;
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::rcpFromRef;
  using std::endl;
  typedef double                          ST;
  typedef Epetra_Operator                 OP;
  typedef Epetra_MultiVector              MV;
  typedef Belos::OperatorTraits<ST,MV,OP> OPT;
  typedef Belos::MultiVecTraits<ST,MV>    MVT;

  // This calls MPI_Init and MPI_Finalize as necessary.
  Belos::Test::MPISession session (inOutArg (argc), inOutArg (argv));
  RCP<const Epetra_Comm> comm = session.getComm ();

  bool success = false;
  bool verbose = false;
  try {
    int MyPID = comm->MyPID ();

    // Parameters to read from command-line processor
    int frequency = -1;  // how often residuals are printed by solver
    int numRHS = 1;  // total number of right-hand sides to solve for
    int maxIters = 13000;  // maximum number of iterations for solver to use
    std::string filename ("bcsstk14.hb");
    double tol = 1.0e-5; // relative residual tolerance

    // Read in command-line arguments
    // NOTE(review): the help strings of the "tol" and "filename" options
    // below break off after their first line; the continuation lines
    // that close them are not visible here.
    Teuchos::CommandLineProcessor cmdp (false, true);
    cmdp.setOption ("verbose", "quiet", &verbose, "Print messages and results.");
    cmdp.setOption ("frequency", &frequency, "Solvers frequency for printing "
        "residuals (#iters).");
    cmdp.setOption ("tol", &tol, "Relative residual tolerance used by MINRES "
    cmdp.setOption ("filename", &filename, "Filename for Harwell-Boeing test "
    cmdp.setOption ("num-rhs", &numRHS, "Number of right-hand sides to solve.");
    cmdp.setOption ("max-iters", &maxIters, "Maximum number of iterations per "
        "linear system (-1 means \"adapt to problem/block size\").");
    if (cmdp.parse (argc,argv) != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL) {
      return EXIT_FAILURE;
    // verbOut is std::cout only in verbose mode on process 0; everywhere
    // else it is a black-hole stream, so writes to it are discarded.
    Teuchos::oblackholestream blackHole;
    std::ostream& verbOut = (verbose && MyPID == 0) ? std::cout : blackHole;

    // Generate the linear system(s) to solve.
    verbOut << "Generating the linear system(s) to solve" << endl << endl;
    RCP<Epetra_CrsMatrix> A;
    RCP<Epetra_MultiVector> B, X;
    RCP<Epetra_Map> rowMap;
    try {
      // This might change the number of right-hand sides, if we read in
      // a right-hand side from the Harwell-Boeing file.
      Belos::Util::createEpetraProblem (filename, &rowMap, &A, &B, &X, &MyPID, numRHS);
    } catch (std::exception& e) {
      // Rethrow as std::runtime_error with the file name added to the
      // message, so the failure is attributed to problem creation.
      TEUCHOS_TEST_FOR_EXCEPTION (true, std::runtime_error,
          "Failed to create Epetra problem for matrix "
          "filename \"" << filename << "\".  "
          "createEpetraProblem() reports the following "
          "error: " << e.what());
    // Compute the initial residual norm of the problem, so we can see
    // by how much it improved after the solve.
    std::vector<double> initialResidualNorms (numRHS);
    std::vector<double> initialResidualInfNorms (numRHS);
    Epetra_MultiVector R (*rowMap, numRHS);
    OPT::Apply (*A, *X, R);
    MVT::MvAddMv (-1.0, R, 1.0, *B, R); // R := -(A*X) + B.
    MVT::MvNorm (R, initialResidualNorms);
    MVT::MvNorm (R, initialResidualInfNorms, Belos::InfNorm);
    if (verbose) {
      verbOut << "Initial residual 2-norms:            \t";
      for (int i = 0; i < numRHS; ++i) {
        verbOut << initialResidualNorms[i];
        if (i < numRHS-1) {
          verbOut << ", ";
      verbOut << endl << "Initial residual Inf-norms:          \t";
      for (int i = 0; i < numRHS; ++i) {
        verbOut << initialResidualInfNorms[i];
        if (i < numRHS-1) {
          verbOut << ", ";
      verbOut << endl;

    // Norms of the right-hand side(s) B, one entry per column.
    std::vector<double> rhs2Norms (numRHS);
    std::vector<double> rhsInfNorms (numRHS);
    MVT::MvNorm (*B, rhs2Norms);
    MVT::MvNorm (*B, rhsInfNorms, Belos::InfNorm);
    if (verbose) {
      verbOut << "Right-hand side 2-norms:             \t";
      for (int i = 0; i < numRHS; ++i) {
        verbOut << rhs2Norms[i];
        if (i < numRHS-1) {
          verbOut << ", ";
      verbOut << endl << "Right-hand side Inf-norms:           \t";
      for (int i = 0; i < numRHS; ++i) {
        verbOut << rhsInfNorms[i];
        if (i < numRHS-1) {
          verbOut << ", ";
      verbOut << endl;

    // Norms of the initial guess(es) X, one entry per column.
    std::vector<double> initialGuess2Norms (numRHS);
    std::vector<double> initialGuessInfNorms (numRHS);
    MVT::MvNorm (*X, initialGuess2Norms);
    MVT::MvNorm (*X, initialGuessInfNorms, Belos::InfNorm);
    if (verbose) {
      verbOut << "Initial guess 2-norms:               \t";
      for (int i = 0; i < numRHS; ++i) {
        verbOut << initialGuess2Norms[i];
        if (i < numRHS-1) {
          verbOut << ", ";
      verbOut << endl << "Initial guess Inf-norms:             \t";
      for (int i = 0; i < numRHS; ++i) {
        verbOut << initialGuessInfNorms[i];
        if (i < numRHS-1) {
          verbOut << ", ";
      verbOut << endl;
    // Compute the infinity-norm of A.
    const double normOfA = A->NormInf ();
    verbOut << "||A||_inf:                           \t" << normOfA << endl;
    // Compute ||A|| ||X_i|| + ||B_i|| for each right-hand side B_i.
    // These values are only printed below; they are not used in the
    // pass/fail decision.
    std::vector<double> scaleFactors (numRHS);
    for (int i = 0; i < numRHS; ++i) {
      scaleFactors[i] = normOfA * initialGuessInfNorms[i] + rhsInfNorms[i];
    if (verbose) {
      verbOut << "||A||_inf ||X_i||_inf + ||B_i||_inf: \t";
      for (int i = 0; i < numRHS; ++i) {
        verbOut << scaleFactors[i];
        if (i < numRHS-1) {
          verbOut << ", ";
      verbOut << endl;

    // Solve using Belos
    verbOut << endl << "Setting up Belos" << endl;
    const int NumGlobalElements = B->GlobalLength();

    // Set up Belos solver parameters.
    // NOTE(review): "Output Frequency" is set only in the verbose branch
    // but is read unconditionally when the problem summary is printed
    // further down.  Presumably the solver fills in a default when it is
    // created with this list -- confirm, since otherwise the read would
    // find no such parameter in quiet mode.
    RCP<ParameterList> belosList = parameterList ("MINRES");
    belosList->set ("Maximum Iterations", maxIters);
    belosList->set ("Convergence Tolerance", tol);
    if (verbose) {
      belosList->set ("Verbosity", Belos::Errors + Belos::Warnings +
          Belos::IterationDetails + Belos::OrthoDetails +
          Belos::FinalSummary + Belos::TimingDetails + Belos::Debug);
      belosList->set ("Output Frequency", frequency);
    else {
      belosList->set ("Verbosity", Belos::Errors + Belos::Warnings);
    belosList->set ("Output Stream", rcpFromRef (verbOut));

    // Construct an unpreconditioned linear problem instance.
    typedef Belos::LinearProblem<double,MV,OP> prob_type;
    RCP<prob_type> problem = rcp (new prob_type (A, X, B));
    if (! problem->setProblem()) {
      verbOut << endl << "ERROR:  Failed to set up Belos::LinearProblem!" << endl;
      return EXIT_FAILURE;

    // Create an iterative solver manager.
    Belos::SolverFactory<double, MV, OP> factory;
    RCP<Belos::SolverManager<double,MV,OP> > newSolver =
      factory.create ("MINRES", belosList);
    newSolver->setProblem (problem);

    // Print out information about problem.  Make sure to use the
    // information as stored in the Belos ParameterList, so that we know
    // what the solver will do.
    verbOut << endl
      << "Dimension of matrix: " << NumGlobalElements << endl
      << "Number of right-hand sides: " << numRHS << endl
      << "Max number of MINRES iterations: "
      << belosList->get<int> ("Maximum Iterations") << endl
      << "Relative residual tolerance: "
      << belosList->get<double> ("Convergence Tolerance") << endl
      << "Output frequency: "
      << belosList->get<int> ("Output Frequency") << endl
      << endl;

    // Solve the linear system.
    verbOut << "Solving the linear system" << endl << endl;
    Belos::ReturnType ret = newSolver->solve();
    verbOut << "Belos results:" << endl
      << "- Number of iterations: "
      << newSolver->getNumIters () << endl
      << "- " << (ret == Belos::Converged ? "Converged" : "Not converged")
      << endl;
    // After the solve, compute residual(s) explicitly.  This tests
    // whether the Belos solver did so correctly.
    std::vector<double> absoluteResidualNorms (numRHS);
    OPT::Apply (*A, *X, R);
    MVT::MvAddMv (-1.0, R, 1.0, *B, R);
    MVT::MvNorm (R, absoluteResidualNorms);

    // Scale each final residual norm by the corresponding initial
    // residual norm; when the initial norm is exactly zero, the
    // absolute norm is used instead to avoid dividing by zero.
    std::vector<double> relativeResidualNorms (numRHS);
    for (int i = 0; i < numRHS; ++i) {
      relativeResidualNorms[i] = (initialResidualNorms[i] == 0.0) ?
        absoluteResidualNorms[i] :
        absoluteResidualNorms[i] / initialResidualNorms[i];

    verbOut << "---------- Computed relative residual norms ----------"
      << endl << endl;
    // NOTE(review): as indented here, the comparison against `tol` that
    // sets badRes sits inside the `if (verbose)` branch, so in quiet
    // mode badRes stays false and only the solver's own return value
    // decides the result -- confirm whether that is intended.
    bool badRes = false;
    if (verbose) {
      for (int i = 0; i < numRHS; ++i) {
        const double actRes = relativeResidualNorms[i];
        verbOut << "Problem " << i << " : \t" << actRes << endl;
        if (actRes > tol) {
          badRes = true;

    Teuchos::TimeMonitor::summarize (verbOut);

    // The test passes only if the solver reported convergence and no
    // relative residual norm was flagged above.
    success = (ret == Belos::Converged && !badRes);

    if (success) {
      verbOut << endl << "End Result: TEST PASSED" << endl;
    } else {
      verbOut << endl << "End Result: TEST FAILED" << endl;
  } // try
  TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);

  return success ? EXIT_SUCCESS : EXIT_FAILURE;
  // TimeMonitor nested timers test: create two timers on all (MPI)
  // processes, use the second inside the scope of the first, and make
  // sure that TimeMonitor::summarize() reports both timers.
  //
  // The same check is then repeated on the output of
  // TimeMonitor::report() with the YAML format, in both the "compact"
  // and the "spacious" style.
  //
  // NOTE(review): the line that opens this test (its declaration) is not
  // visible here.
    using Teuchos::ParameterList;
    using Teuchos::parameterList;
    using Teuchos::RCP;

    // Presumably runs the outer and the nested timer whose labels are
    // searched for below -- confirm against func_time_monitor2().
    func_time_monitor2 ();

      // First, check the output of TimeMonitor::summarize().
      std::ostringstream oss;
      TimeMonitor::summarize (oss);

      // Echo summarize() output to the FancyOStream out (which is a
      // standard unit test argument).  Output should only appear in
      // show-all-test-details mode.
      out << oss.str() << std::endl;

      // NOTE(review): "FUNC_TIME_MONITOR2" is a prefix of
      // "FUNC_TIME_MONITOR2_inner", so the first search also matches the
      // inner label; only the second check distinguishes the two timers.
      const size_t substr_i = oss.str().find ("FUNC_TIME_MONITOR2");
      TEST_INEQUALITY(substr_i, std::string::npos);
      const size_t substr_inner_i = oss.str().find ("FUNC_TIME_MONITOR2_inner");
      TEST_INEQUALITY(substr_inner_i, std::string::npos);

    { // Repeat test for YAML output, compact style.
      std::ostringstream oss;
      // Start from a copy of the valid report parameters and override
      // only the format and the style.
      RCP<ParameterList> reportParams =
        parameterList (* (TimeMonitor::getValidReportParameters ()));
      reportParams->set ("Report format", "YAML");
      reportParams->set ("YAML style", "compact");
      TimeMonitor::report (oss, reportParams);

      // Echo output to the FancyOStream out (which is a standard unit
      // test argument).  Output should only appear in "show all test
      // details" mode.
      out << oss.str () << std::endl;

      // Both timer labels must appear in the report.
      const size_t substr_i = oss.str().find ("FUNC_TIME_MONITOR2");
      TEST_INEQUALITY(substr_i, std::string::npos);
      const size_t substr_inner_i = oss.str().find ("FUNC_TIME_MONITOR2_inner");
      TEST_INEQUALITY(substr_inner_i, std::string::npos);

    { // Repeat test for YAML output, spacious style.
      std::ostringstream oss;
      RCP<ParameterList> reportParams =
        parameterList (* (TimeMonitor::getValidReportParameters ()));
      reportParams->set ("Report format", "YAML");
      reportParams->set ("YAML style", "spacious");
      TimeMonitor::report (oss, reportParams);

      // Echo output to the FancyOStream out (which is a standard unit
      // test argument).  Output should only appear in "show all test
      // details" mode.
      out << oss.str () << std::endl;

      // Both timer labels must appear in the report.
      const size_t substr_i = oss.str().find ("FUNC_TIME_MONITOR2");
      TEST_INEQUALITY(substr_i, std::string::npos);
      const size_t substr_inner_i = oss.str().find ("FUNC_TIME_MONITOR2_inner");
      TEST_INEQUALITY(substr_inner_i, std::string::npos);

    // This sets up for the next unit test.
    TimeMonitor::clearCounters ();