int main(int argc, char *argv[])
{
#ifdef ENABLE_INTEL_FLOATING_POINT_EXCEPTIONS
  cout << "NOTE: enabling floating point exceptions for divide by zero.\n";
  _MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~_MM_MASK_INVALID);
#endif

  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
  int rank = Teuchos::GlobalMPISession::getRank();

#ifdef HAVE_MPI
  Epetra_MpiComm Comm(MPI_COMM_WORLD);
  //cout << "rank: " << rank << " of " << numProcs << endl;
#else
  Epetra_SerialComm Comm;
#endif

  Comm.Barrier(); // set breakpoint here to allow debugger attachment to other MPI processes than the one you automatically attached to.

  Teuchos::CommandLineProcessor cmdp(false,true); // false: don't throw exceptions; true: do return errors for unrecognized options

  double minTol = 1e-8;

  bool use3D = false;
  int refCount = 10;

  int k = 4; // poly order for field variables
  int delta_k = use3D ? 3 : 2;   // test space enrichment
  int k_coarse = 0;

  bool useMumps = true;
  bool useGMGSolver = true;

  bool enforceOneIrregularity = true;
  bool useStaticCondensation = false;
  bool conformingTraces = false;
  bool useDiagonalScaling = false; // of the global stiffness matrix in GMGSolver

  bool printRefinementDetails = false;

  bool useWeightedGraphNorm = true; // graph norm scaled according to units, more or less

  int numCells = 2;

  int AztecOutputLevel = 1;
  int gmgMaxIterations = 10000;
  int smootherOverlap = 0;
  double relativeTol = 1e-6;
  double D = 1.0; // characteristic length scale

  cmdp.setOption("polyOrder",&k,"polynomial order for field variable u");
  cmdp.setOption("delta_k", &delta_k, "test space polynomial order enrichment");
  cmdp.setOption("k_coarse", &k_coarse, "polynomial order for field variables on coarse mesh");
  cmdp.setOption("numRefs",&refCount,"number of refinements");
  cmdp.setOption("D", &D, "domain dimension");
  cmdp.setOption("useConformingTraces", "useNonConformingTraces", &conformingTraces);
  cmdp.setOption("enforceOneIrregularity", "dontEnforceOneIrregularity", &enforceOneIrregularity);

  cmdp.setOption("smootherOverlap", &smootherOverlap, "overlap for smoother");

  cmdp.setOption("printRefinementDetails", "dontPrintRefinementDetails", &printRefinementDetails);
  cmdp.setOption("azOutput", &AztecOutputLevel, "Aztec output level");
  cmdp.setOption("numCells", &numCells, "number of cells in the initial mesh");
  cmdp.setOption("useScaledGraphNorm", "dontUseScaledGraphNorm", &useWeightedGraphNorm);
//  cmdp.setOption("gmgTol", &gmgTolerance, "tolerance for GMG convergence");
  cmdp.setOption("relativeTol", &relativeTol, "Energy error-relative tolerance for iterative solver.");
  cmdp.setOption("gmgMaxIterations", &gmgMaxIterations, "tolerance for GMG convergence");

  bool enhanceUField = false;
  cmdp.setOption("enhanceUField", "dontEnhanceUField", &enhanceUField);
  cmdp.setOption("useStaticCondensation", "dontUseStaticCondensation", &useStaticCondensation);

  if (cmdp.parse(argc,argv) != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL)
  {
#ifdef HAVE_MPI
    MPI_Finalize();
#endif
    return -1;
  }

  double width = D, height = D, depth = D;

  VarFactory varFactory;
  // fields:
  VarPtr u = varFactory.fieldVar("u", L2);
  VarPtr sigma = varFactory.fieldVar("\\sigma", VECTOR_L2);

  FunctionPtr n = Function::normal();
  // traces:
  VarPtr u_hat;

  if (conformingTraces)
  {
    u_hat = varFactory.traceVar("\\widehat{u}", u);
  }
  else
  {
    cout << "Note: using non-conforming traces.\n";
    u_hat = varFactory.traceVar("\\widehat{u}", u, L2);
  }
  VarPtr sigma_n_hat = varFactory.fluxVar("\\widehat{\\sigma}_{n}", sigma * n);

  // test functions:
  VarPtr tau = varFactory.testVar("\\tau", HDIV);
  VarPtr v = varFactory.testVar("v", HGRAD);

  BFPtr poissonBF = Teuchos::rcp( new BF(varFactory) );
  FunctionPtr alpha = Function::constant(1); // viscosity

  // tau terms:
  poissonBF->addTerm(sigma / alpha, tau);
  poissonBF->addTerm(-u, tau->div()); // (sigma1, tau1)
  poissonBF->addTerm(u_hat, tau * n);

  // v terms:
  poissonBF->addTerm(- sigma, v->grad()); // (mu sigma1, grad v1)
  poissonBF->addTerm( sigma_n_hat, v);

  int horizontalCells = numCells, verticalCells = numCells, depthCells = numCells;

  vector<double> domainDimensions;
  domainDimensions.push_back(width);
  domainDimensions.push_back(height);

  vector<int> elementCounts;
  elementCounts.push_back(horizontalCells);
  elementCounts.push_back(verticalCells);

  if (use3D)
  {
    domainDimensions.push_back(depth);
    elementCounts.push_back(depthCells);
  }

  MeshPtr mesh, k0Mesh;

  int H1Order = k + 1;
  int H1Order_coarse = k_coarse + 1;
  if (!use3D)
  {
    Teuchos::ParameterList pl;

    map<int,int> trialOrderEnhancements;

    if (enhanceUField)
    {
      trialOrderEnhancements[u->ID()] = 1;
    }

    BFPtr poissonBilinearForm = poissonBF;

    pl.set("useMinRule", true);
    pl.set("bf",poissonBilinearForm);
    pl.set("H1Order", H1Order);
    pl.set("delta_k", delta_k);
    pl.set("horizontalElements", horizontalCells);
    pl.set("verticalElements", verticalCells);
    pl.set("divideIntoTriangles", false);
    pl.set("useConformingTraces", conformingTraces);
    pl.set("trialOrderEnhancements", &trialOrderEnhancements);
    pl.set("x0",(double)0);
    pl.set("y0",(double)0);
    pl.set("width", width);
    pl.set("height",height);

    mesh = MeshFactory::quadMesh(pl);

    pl.set("H1Order", H1Order_coarse);
    k0Mesh = MeshFactory::quadMesh(pl);

  }
  else
  {
    mesh = MeshFactory::rectilinearMesh(poissonBF, domainDimensions, elementCounts, H1Order, delta_k);
    k0Mesh = MeshFactory::rectilinearMesh(poissonBF, domainDimensions, elementCounts, H1Order_coarse, delta_k);
  }

  mesh->registerObserver(k0Mesh); // ensure that the k0 mesh refinements track those of the solution mesh

  RHSPtr rhs = RHS::rhs(); // zero
  FunctionPtr sin_pi_x = Teuchos::rcp( new Sin_ax(PI/D) );
  FunctionPtr sin_pi_y = Teuchos::rcp( new Sin_ay(PI/D) );
  FunctionPtr u_exact = sin_pi_x * sin_pi_y;
  FunctionPtr f = -(2.0 * PI * PI / (D * D)) * sin_pi_x * sin_pi_y;
  rhs->addTerm( f * v );

  BCPtr bc = BC::bc();
  SpatialFilterPtr boundary = SpatialFilter::allSpace();

  bc->addDirichlet(u_hat, boundary, u_exact);

  IPPtr graphNorm;

  FunctionPtr h = Teuchos::rcp( new hFunction() );

  if (useWeightedGraphNorm)
  {
    graphNorm = IP::ip();
    graphNorm->addTerm( tau->div() ); // u
    graphNorm->addTerm( (h / alpha) * tau - h * v->grad() ); // sigma
    graphNorm->addTerm( v ); // boundary term (adjoint to u)
    graphNorm->addTerm( h * tau );

//    // new effort, with the idea that the test norm should be considered in reference space, basically
//    graphNorm = IP::ip();
//    graphNorm->addTerm( tau->div() ); // u
//    graphNorm->addTerm( tau / h - v->grad() ); // sigma
//    graphNorm->addTerm( v / h ); // boundary term (adjoint to u)
//    graphNorm->addTerm( tau / h );
  }
  else
  {
    map<int, double> trialWeights; // on the squared terms in the trial space norm
    trialWeights[u->ID()] = 1.0 / (D * D);
    trialWeights[sigma->ID()] = 1.0;
    graphNorm = poissonBF->graphNorm(trialWeights, 1.0); // 1.0: weight on the L^2 terms
  }

  SolutionPtr solution = Solution::solution(mesh, bc, rhs, graphNorm);
  solution->setUseCondensedSolve(useStaticCondensation);

  mesh->registerSolution(solution); // sign up for projection of old solution onto refined cells.

  double energyThreshold = 0.2;
  RefinementStrategy refinementStrategy( solution, energyThreshold );

  refinementStrategy.setReportPerCellErrors(true);
  refinementStrategy.setEnforceOneIrregularity(enforceOneIrregularity);

  Teuchos::RCP<Solver> coarseSolver, fineSolver;
  if (useMumps)
  {
#ifdef HAVE_AMESOS_MUMPS
    coarseSolver = Teuchos::rcp( new MumpsSolver(512, true) );
#else
    cout << "useMumps=true, but MUMPS is not available!\n";
    exit(0);
#endif
  }
  else
  {
    coarseSolver = Teuchos::rcp( new KluSolver );
  }
  GMGSolver* gmgSolver;

  if (useGMGSolver)
  {
    double tol = relativeTol;
    int maxIters = gmgMaxIterations;
    BCPtr zeroBCs = bc->copyImposingZero();
    gmgSolver = new GMGSolver(zeroBCs, k0Mesh, graphNorm, mesh, solution->getDofInterpreter(),
                              solution->getPartitionMap(), maxIters, tol, coarseSolver,
                              useStaticCondensation);

    gmgSolver->setAztecOutput(AztecOutputLevel);
    gmgSolver->setUseConjugateGradient(true);
    gmgSolver->gmgOperator()->setSmootherType(GMGOperator::IFPACK_ADDITIVE_SCHWARZ);
    gmgSolver->gmgOperator()->setSmootherOverlap(smootherOverlap);

    fineSolver = Teuchos::rcp( gmgSolver );
  }
  else
  {
    fineSolver = coarseSolver;
  }

//  if (rank==0) cout << "experimentally starting by solving with MUMPS on the fine mesh.\n";
//  solution->solve( Teuchos::rcp( new MumpsSolver) );

  solution->solve(fineSolver);

#ifdef HAVE_EPETRAEXT_HDF5
  ostringstream dir_name;
  dir_name << "poissonCavityFlow_k" << k;
  HDF5Exporter exporter(mesh,dir_name.str());
  exporter.exportSolution(solution,varFactory,0);
#endif

#ifdef HAVE_AMESOS_MUMPS
  if (useMumps) coarseSolver = Teuchos::rcp( new MumpsSolver(512, true) );
#endif

  solution->reportTimings();
  if (useGMGSolver) gmgSolver->gmgOperator()->reportTimings();
  for (int refIndex=0; refIndex < refCount; refIndex++)
  {
    double energyError = solution->energyErrorTotal();
    GlobalIndexType numFluxDofs = mesh->numFluxDofs();
    if (rank==0)
    {
      cout << "Before refinement " << refIndex << ", energy error = " << energyError;
      cout << " (using " << numFluxDofs << " trace degrees of freedom)." << endl;
    }
    bool printToConsole = printRefinementDetails && (rank==0);
    refinementStrategy.refine(printToConsole);

    if (useStaticCondensation)
    {
      CondensedDofInterpreter* condensedDofInterpreter = dynamic_cast<CondensedDofInterpreter*>(solution->getDofInterpreter().get());
      if (condensedDofInterpreter != NULL)
      {
        condensedDofInterpreter->reinitialize();
      }
    }

    GlobalIndexType fineDofs = mesh->globalDofCount();
    GlobalIndexType coarseDofs = k0Mesh->globalDofCount();
    if (rank==0)
    {
      cout << "After refinement, coarse mesh has " << k0Mesh->numActiveElements() << " elements and " << coarseDofs << " dofs.\n";
      cout << "  Fine mesh has " << mesh->numActiveElements() << " elements and " << fineDofs << " dofs.\n";
    }

    if (!use3D)
    {
      ostringstream fineMeshLocation, coarseMeshLocation;
      fineMeshLocation << "poissonFineMesh_k" << k << "_ref" << refIndex;
      GnuPlotUtil::writeComputationalMeshSkeleton(fineMeshLocation.str(), mesh, true); // true: label cells
      coarseMeshLocation << "poissonCoarseMesh_k" << k << "_ref" << refIndex;
      GnuPlotUtil::writeComputationalMeshSkeleton(coarseMeshLocation.str(), k0Mesh, true); // true: label cells
    }

    if (useGMGSolver)   // create fresh fineSolver now that the meshes have changed:
    {
#ifdef HAVE_AMESOS_MUMPS
      if (useMumps) coarseSolver = Teuchos::rcp( new MumpsSolver(512, true) );
#endif
      double tol = max(relativeTol * energyError, minTol);
      int maxIters = gmgMaxIterations;
      BCPtr zeroBCs = bc->copyImposingZero();
      gmgSolver = new GMGSolver(zeroBCs, k0Mesh, graphNorm, mesh, solution->getDofInterpreter(),
                                solution->getPartitionMap(), maxIters, tol, coarseSolver, useStaticCondensation);
      gmgSolver->setAztecOutput(AztecOutputLevel);
      gmgSolver->setUseDiagonalScaling(useDiagonalScaling);
      fineSolver = Teuchos::rcp( gmgSolver );
    }

    solution->solve(fineSolver);
    solution->reportTimings();
    if (useGMGSolver) gmgSolver->gmgOperator()->reportTimings();

#ifdef HAVE_EPETRAEXT_HDF5
    exporter.exportSolution(solution,varFactory,refIndex+1);
#endif
  }
  double energyErrorTotal = solution->energyErrorTotal();

  GlobalIndexType numFluxDofs = mesh->numFluxDofs();
  GlobalIndexType numGlobalDofs = mesh->numGlobalDofs();
  if (rank==0)
  {
    cout << "Final mesh has " << mesh->numActiveElements() << " elements and " << numFluxDofs << " trace dofs (";
    cout << numGlobalDofs << " total dofs, including fields).\n";
    cout << "Final energy error: " << energyErrorTotal << endl;
  }

#ifdef HAVE_EPETRAEXT_HDF5
  exporter.exportSolution(solution,varFactory,0);
#endif

  if (!use3D)
  {
    GnuPlotUtil::writeComputationalMeshSkeleton("poissonRefinedMesh", mesh, true);
  }

  coarseSolver = Teuchos::rcp((Solver*) NULL); // without this when useMumps = true and running on one rank, we see a crash on exit, which may have to do with MPI being finalized before coarseSolver is deleted.

  return 0;
}
int main(int argc, char *argv[])
{

#ifdef HAVE_MPI
  Teuchos::GlobalMPISession mpiSession(&argc, &argv,0);

  Epetra_MpiComm Comm(MPI_COMM_WORLD);
#else
  Epetra_SerialComm Comm;
#endif

  int commRank = Teuchos::GlobalMPISession::getRank();

  Comm.Barrier(); // set breakpoint here to allow debugger attachment to other MPI processes than the one you automatically attached to.

  Teuchos::CommandLineProcessor cmdp(false,true); // false: don't throw exceptions; true: do return errors for unrecognized options

  // problem parameters:
  int spaceDim = 2;
  double Re = 40;
  bool steady = false;
  string problemChoice = "TaylorGreen";
  int numRefs = 1;
  int p = 2, delta_p = 2;
  int numXElems = 1;
  int numTElems = 1;
  int numSlabs = 1;
  bool useConformingTraces = false;
  string solverChoice = "KLU";
  string multigridStrategyString = "W-cycle";
  bool useCondensedSolve = false;
  bool useConjugateGradient = true;
  bool logFineOperator = false;
  // double solverTolerance = 1e-8;
  double nonlinearTolerance = 1e-5;
  // int maxLinearIterations = 10000;
  int maxNonlinearIterations = 20;
  int cgMaxIterations = 10000;
  double cgTol = 1e-8;
  bool computeL2Error = false;
  bool exportSolution = false;
  bool saveSolution = false;
  bool loadSolution = false;
  int loadRef = 0;
  int loadDirRef = 0;
  string norm = "Graph";
  string rootDir = ".";
  string tag="";
  cmdp.setOption("spaceDim", &spaceDim, "spatial dimension");
  cmdp.setOption("Re", &Re, "Re");
  cmdp.setOption("steady", "transient", &steady, "use steady incompressible Navier-Stokes");
  cmdp.setOption("problem", &problemChoice, "Kovasznay, TaylorGreen");
  cmdp.setOption("polyOrder",&p,"polynomial order for field variable u");
  cmdp.setOption("delta_p", &delta_p, "test space polynomial order enrichment");
  cmdp.setOption("numRefs",&numRefs,"number of refinements");
  cmdp.setOption("numXElems",&numXElems,"number of elements in x direction");
  cmdp.setOption("numTElems",&numTElems,"number of elements in t direction");
  cmdp.setOption("numSlabs",&numSlabs,"number of time slabs to use");
  cmdp.setOption("norm", &norm, "norm");
  cmdp.setOption("conformingTraces", "nonconformingTraces", &useConformingTraces, "use conforming traces");
  cmdp.setOption("solver", &solverChoice, "KLU, SuperLU, MUMPS, GMG-Direct, GMG-ILU, GMG-IC");
  cmdp.setOption("multigridStrategy", &multigridStrategyString, "Multigrid strategy: V-cycle, W-cycle, Full, or Two-level");
  cmdp.setOption("useCondensedSolve", "useStandardSolve", &useCondensedSolve);
  cmdp.setOption("CG", "GMRES", &useConjugateGradient);
  cmdp.setOption("logFineOperator", "dontLogFineOperator", &logFineOperator);
  // cmdp.setOption("solverTolerance", &solverTolerance, "iterative solver tolerance");
  cmdp.setOption("nonlinearTolerance", &nonlinearTolerance, "nonlinear solver tolerance");
  // cmdp.setOption("maxLinearIterations", &maxLinearIterations, "maximum number of iterations for linear solver");
  cmdp.setOption("maxNonlinearIterations", &maxNonlinearIterations, "maximum number of iterations for Newton solver");
  cmdp.setOption("exportDir", &rootDir, "export directory");
  cmdp.setOption("computeL2Error", "skipL2Error", &computeL2Error, "compute L2 error");
  cmdp.setOption("exportSolution", "skipExport", &exportSolution, "export solution to HDF5");
  cmdp.setOption("saveSolution", "skipSave", &saveSolution, "save mesh and solution to HDF5");
  cmdp.setOption("loadSolution", "skipLoad", &loadSolution, "load mesh and solution from HDF5");
  cmdp.setOption("loadRef", &loadRef, "load refinement number");
  cmdp.setOption("loadDirRef", &loadDirRef, "which refinement directory to load from");
  cmdp.setOption("tag", &tag, "output tag");

  if (cmdp.parse(argc,argv) != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL)
  {
#ifdef HAVE_MPI
    MPI_Finalize();
#endif
    return -1;
  }

  map<string, Teuchos::RCP<IncompressibleProblem>> problems;
  problems["ManufacturedSolution"] = Teuchos::rcp(new IncompressibleManufacturedSolution(steady, Re, numXElems));
  problems["Kovasznay"] = Teuchos::rcp(new KovasznayProblem(steady, Re));
  problems["TaylorGreen"] = Teuchos::rcp(new TaylorGreenProblem(steady, Re, numXElems, numSlabs));
  problems["Cylinder"] = Teuchos::rcp(new CylinderProblem(steady, Re, numSlabs));
  problems["SquareCylinder"] = Teuchos::rcp(new SquareCylinderProblem(steady, Re, numSlabs));
  Teuchos::RCP<IncompressibleProblem> problem = problems.at(problemChoice);

  // if (commRank == 0)
  // {
  //   Solver::printAvailableSolversReport();
  //   cout << endl;
  // }
  Teuchos::RCP<Time> totalTimer = Teuchos::TimeMonitor::getNewCounter("Total Time");
  totalTimer->start(true);

  for (; problem->currentStep() < problem->numSlabs(); problem->advanceStep())
  {
    if (problem->numSlabs() > 1 && commRank == 0 && !steady)
      cout << "Solving time slab [" << problem->currentT0() << ", " << problem->currentT1() << "]" << endl;

    ostringstream problemName;
    string isSteady = "Steady";
    if (!steady)
      isSteady = "Transient";
    problemName << isSteady << problemChoice << spaceDim << "D_slab" << problem->currentStep() << "_" << norm << "_" << Re << "_p" << p << "_" << solverChoice;
    if (tag != "")
      problemName << "_" << tag;
    ostringstream saveDir;
    saveDir << problemName.str() << "_ref" << loadRef;

    int success = mkdir((rootDir+"/"+saveDir.str()).c_str(), S_IRWXU | S_IRWXG);

    string dataFileLocation = rootDir + "/" + saveDir.str() + "/" + saveDir.str() + ".data";
    string exportName = saveDir.str();

    ostringstream loadDir;
    loadDir << problemName.str() << "_ref" << loadDirRef;
    string loadFilePrefix = "";
    if (loadSolution)
    {
      loadFilePrefix = rootDir + "/" + loadDir.str() + "/" + saveDir.str();
      if (commRank == 0) cout << "Loading previous solution " << loadFilePrefix << endl;
    }
    // ostringstream saveDir;
    // saveDir << problemName.str() << "_ref" << loadRef;
    string saveFilePrefix = rootDir + "/" + saveDir.str() + "/" + problemName.str();
    if (saveSolution && commRank == 0) cout << "Saving to " << saveFilePrefix << endl;

    Teuchos::ParameterList parameters;
    parameters.set("spaceDim", spaceDim);
    parameters.set("steady", steady);
    parameters.set("mu", 1./Re);
    parameters.set("useConformingTraces", useConformingTraces);
    parameters.set("fieldPolyOrder", p);
    parameters.set("delta_p", delta_p);
    parameters.set("numTElems", numTElems);
    parameters.set("norm", norm);
    parameters.set("savedSolutionAndMeshPrefix", loadFilePrefix);
    SpaceTimeIncompressibleFormulationPtr form = Teuchos::rcp(new SpaceTimeIncompressibleFormulation(problem, parameters));

    MeshPtr mesh = form->solutionUpdate()->mesh();
    vector<MeshPtr> meshesCoarseToFine;
    MeshPtr k0Mesh = Teuchos::rcp( new Mesh (mesh->getTopology()->deepCopy(), form->bf(), 1, delta_p) );
    meshesCoarseToFine.push_back(k0Mesh);
    meshesCoarseToFine.push_back(mesh);
    // mesh->registerObserver(k0Mesh);

    // Set up boundary conditions
    problem->setBCs(form);

    // Set up solution
    SolutionPtr solutionUpdate = form->solutionUpdate();
    SolutionPtr solutionBackground = form->solutionBackground();
    // dynamic_cast<AnalyticalIncompressibleProblem*>(problem.get())->projectExactSolution(solutionBackground);

    RefinementStrategyPtr refStrategy = form->getRefinementStrategy();
    Teuchos::RCP<HDF5Exporter> exporter;
    if (exportSolution)
      exporter = Teuchos::rcp(new HDF5Exporter(mesh,exportName, rootDir));

    Teuchos::RCP<Time> solverTime = Teuchos::TimeMonitor::getNewCounter("Solve Time");
    map<string, SolverPtr> solvers;
    solvers["KLU"] = Solver::getSolver(Solver::KLU, true);
#if defined(HAVE_AMESOS_SUPERLUDIST) || defined(HAVE_AMESOS2_SUPERLUDIST)
    solvers["SuperLUDist"] = Solver::getSolver(Solver::SuperLUDist, true);
#endif
#ifdef HAVE_AMESOS_MUMPS
    solvers["MUMPS"] = Solver::getSolver(Solver::MUMPS, true);
#endif
    bool useStaticCondensation = false;

    GMGOperator::MultigridStrategy multigridStrategy;
    if (multigridStrategyString == "Two-level")
    {
      multigridStrategy = GMGOperator::TWO_LEVEL;
    }
    else if (multigridStrategyString == "W-cycle")
    {
      multigridStrategy = GMGOperator::W_CYCLE;
    }
    else if (multigridStrategyString == "V-cycle")
    {
      multigridStrategy = GMGOperator::V_CYCLE;
    }
    else if (multigridStrategyString == "Full-V")
    {
      multigridStrategy = GMGOperator::FULL_MULTIGRID_V;
    }
    else if (multigridStrategyString == "Full-W")
    {
      multigridStrategy = GMGOperator::FULL_MULTIGRID_W;
    }
    else
    {
      TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, "unrecognized multigrid strategy");
    }

    ofstream dataFile(dataFileLocation);
    dataFile << "ref\t " << "elements\t " << "dofs\t " << "energy\t " << "l2\t " << "solvetime\t" << "iterations\t " << endl;

    // {
    //   // ostringstream saveFile;
    //   // saveFile << saveFilePrefix << "_ref" << -1;
    //   // form->save(saveFile.str());
    //   exporter->exportSolution(solutionBackground, -1);
    //   if (commRank == 0)
    //     cout << "Done exporting" << endl;
    // }

    for (int refIndex=loadRef; refIndex <= numRefs; refIndex++)
    {
      double l2Update = 1e10;
      int iterCount = 0;
      solverTime->start(true);
      Teuchos::RCP<GMGSolver> gmgSolver;
      if (solverChoice[0] == 'G')
      {
        // gmgSolver = Teuchos::rcp( new GMGSolver(solutionUpdate, k0Mesh, maxLinearIterations, solverTolerance, Solver::getDirectSolver(true), useStaticCondensation));
        bool reuseFactorization = true;
        SolverPtr coarseSolver = Solver::getDirectSolver(reuseFactorization);
        gmgSolver = Teuchos::rcp(new GMGSolver(solutionUpdate, meshesCoarseToFine, cgMaxIterations, cgTol, multigridStrategy, coarseSolver, useCondensedSolve));
        gmgSolver->setUseConjugateGradient(useConjugateGradient);
        int azOutput = 20; // print residual every 20 CG iterations
        gmgSolver->setAztecOutput(azOutput);
        gmgSolver->gmgOperator()->setNarrateOnRankZero(logFineOperator,"finest GMGOperator");

        // gmgSolver->setAztecOutput(azOutput);
        // if (solverChoice == "GMG-Direct")
        //   gmgSolver->gmgOperator()->setSchwarzFactorizationType(GMGOperator::Direct);
        // if (solverChoice == "GMG-ILU")
        //   gmgSolver->gmgOperator()->setSchwarzFactorizationType(GMGOperator::ILU);
        // if (solverChoice == "GMG-IC")
        //   gmgSolver->gmgOperator()->setSchwarzFactorizationType(GMGOperator::IC);
      }
      while (l2Update > nonlinearTolerance && iterCount < maxNonlinearIterations)
      {
        if (solverChoice[0] == 'G')
          solutionUpdate->solve(gmgSolver);
        else
          solutionUpdate->condensedSolve(solvers[solverChoice]);

        // Compute L2 norm of update
        double u1L2Update = solutionUpdate->L2NormOfSolutionGlobal(form->u(1)->ID());
        double u2L2Update = solutionUpdate->L2NormOfSolutionGlobal(form->u(2)->ID());
        l2Update = sqrt(u1L2Update*u1L2Update + u2L2Update*u2L2Update);
        if (commRank == 0)
          cout << "Nonlinear Update:\t " << l2Update << endl;

        form->updateSolution();
        iterCount++;
      }
      double solveTime = solverTime->stop();

      double energyError = solutionUpdate->energyErrorTotal();
      double l2Error = 0;
      if (computeL2Error)
      {
        l2Error = problem->computeL2Error(form, solutionBackground);
      }
      if (commRank == 0)
      {
        cout << "Refinement: " << refIndex
          << " \tElements: " << mesh->numActiveElements()
          << " \tDOFs: " << mesh->numGlobalDofs()
          << " \tEnergy Error: " << energyError
          << " \tL2 Error: " << l2Error
          << " \tSolve Time: " << solveTime
          << " \tTotal Time: " << totalTimer->totalElapsedTime(true)
          // << " \tIteration Count: " << iterationCount
          << endl;
        dataFile << refIndex
          << " " << mesh->numActiveElements()
          << " " << mesh->numGlobalDofs()
          << " " << energyError
          << " " << l2Error
          << " " << solveTime
          << " " << totalTimer->totalElapsedTime(true)
          // << " " << iterationCount
          << endl;
      }

      if (exportSolution)
        exporter->exportSolution(solutionBackground, refIndex);

      if (saveSolution)
      {
        ostringstream saveFile;
        saveFile << saveFilePrefix << "_ref" << refIndex;
        form->save(saveFile.str());
      }

      if (refIndex != numRefs)
      {
        // k0Mesh = Teuchos::rcp( new Mesh (mesh->getTopology()->deepCopy(), form->bf(), 1, delta_p) );
        // meshesCoarseToFine.push_back(k0Mesh);
        refStrategy->refine();
        meshesCoarseToFine.push_back(mesh);
      }
    }
    dataFile.close();
  }
  double totalTime = totalTimer->stop();
  if (commRank == 0)
    cout << "Total time = " << totalTime << endl;

  return 0;
}
int main(int argc, char *argv[])
{
  int rank = 0;
#ifdef HAVE_MPI
  // TODO: figure out the right thing to do here...
  // may want to modify argc and argv before we make the following call:
  Teuchos::GlobalMPISession mpiSession(&argc, &argv,0);
  rank=mpiSession.getRank();
#else
#endif
  bool useLineSearch = false;

  int pToAdd = 2; // for optimal test function approximation
  int pToAddForStreamFunction = 2;
  double nonlinearStepSize = 1.0;
  double dt = 0.5;
  double nonlinearRelativeEnergyTolerance = 0.015; // used to determine convergence of the nonlinear solution
  //  double nonlinearRelativeEnergyTolerance = 0.15; // used to determine convergence of the nonlinear solution
  double eps = 1.0/64.0; // width of ramp up to 1.0 for top BC;  eps == 0 ==> soln not in H1
  // epsilon above is chosen to match our initial 16x16 mesh, to avoid quadrature errors.
  //  double eps = 0.0; // John Evans's problem: not in H^1
  bool enforceLocalConservation = false;
  bool enforceOneIrregularity = true;
  bool reportPerCellErrors  = true;
  bool useMumps = true;

  int horizontalCells, verticalCells;

  int maxIters = 50; // for nonlinear steps

  vector<double> ReValues;

  // usage: polyOrder [numRefinements]
  // parse args:
  if (argc < 6)
  {
    cout << "Usage: NavierStokesCavityFlowContinuationFixedMesh fieldPolyOrder hCells vCells energyErrorGoal Re0 [Re1 ...]\n";
    return -1;
  }
  int polyOrder = atoi(argv[1]);
  horizontalCells = atoi(argv[2]);
  verticalCells = atoi(argv[3]);
  double energyErrorGoal = atof(argv[4]);
  for (int i=5; i<argc; i++)
  {
    ReValues.push_back(atof(argv[i]));
  }
  if (rank == 0)
  {
    cout << "L^2 order: " << polyOrder << endl;
    cout << "initial mesh size: " << horizontalCells << " x " << verticalCells << endl;
    cout << "energy error goal: " << energyErrorGoal << endl;
    cout << "Reynolds number values for continuation:\n";
    for (int i=0; i<ReValues.size(); i++)
    {
      cout << ReValues[i] << ", ";
    }
    cout << endl;
  }

  FieldContainer<double> quadPoints(4,2);

  quadPoints(0,0) = 0.0; // x1
  quadPoints(0,1) = 0.0; // y1
  quadPoints(1,0) = 1.0;
  quadPoints(1,1) = 0.0;
  quadPoints(2,0) = 1.0;
  quadPoints(2,1) = 1.0;
  quadPoints(3,0) = 0.0;
  quadPoints(3,1) = 1.0;

  // define meshes:
  int H1Order = polyOrder + 1;
  bool useTriangles = false;
  bool meshHasTriangles = useTriangles;

  double minL2Increment = 1e-8;

  // get variable definitions:
  VarFactory varFactory = VGPStokesFormulation::vgpVarFactory();
  u1 = varFactory.fieldVar(VGP_U1_S);
  u2 = varFactory.fieldVar(VGP_U2_S);
  sigma11 = varFactory.fieldVar(VGP_SIGMA11_S);
  sigma12 = varFactory.fieldVar(VGP_SIGMA12_S);
  sigma21 = varFactory.fieldVar(VGP_SIGMA21_S);
  sigma22 = varFactory.fieldVar(VGP_SIGMA22_S);
  p = varFactory.fieldVar(VGP_P_S);

  u1hat = varFactory.traceVar(VGP_U1HAT_S);
  u2hat = varFactory.traceVar(VGP_U2HAT_S);
  t1n = varFactory.fluxVar(VGP_T1HAT_S);
  t2n = varFactory.fluxVar(VGP_T2HAT_S);

  v1 = varFactory.testVar(VGP_V1_S, HGRAD);
  v2 = varFactory.testVar(VGP_V2_S, HGRAD);
  tau1 = varFactory.testVar(VGP_TAU1_S, HDIV);
  tau2 = varFactory.testVar(VGP_TAU2_S, HDIV);
  q = varFactory.testVar(VGP_Q_S, HGRAD);

  FunctionPtr u1_0 = Teuchos::rcp( new U1_0(eps) );
  FunctionPtr u2_0 = Teuchos::rcp( new U2_0 );
  FunctionPtr zero = Function::zero();
  ParameterFunctionPtr Re_param = ParameterFunction::parameterFunction(1);
  VGPNavierStokesProblem problem = VGPNavierStokesProblem(Re_param,quadPoints,
                                   horizontalCells,verticalCells,
                                   H1Order, pToAdd,
                                   u1_0, u2_0,  // BC for u
                                   zero, zero); // zero forcing function
  SolutionPtr solution = problem.backgroundFlow();
  SolutionPtr solnIncrement = problem.solutionIncrement();

  Teuchos::RCP<Mesh> mesh = problem.mesh();
  mesh->registerSolution(solution);
  mesh->registerSolution(solnIncrement);

  ///////////////////////////////////////////////////////////////////////////

  // define bilinear form for stream function:
  VarFactory streamVarFactory;
  VarPtr phi_hat = streamVarFactory.traceVar("\\widehat{\\phi}");
  VarPtr psin_hat = streamVarFactory.fluxVar("\\widehat{\\psi}_n");
  VarPtr psi_1 = streamVarFactory.fieldVar("\\psi_1");
  VarPtr psi_2 = streamVarFactory.fieldVar("\\psi_2");
  VarPtr phi = streamVarFactory.fieldVar("\\phi");
  VarPtr q_s = streamVarFactory.testVar("q_s", HGRAD);
  VarPtr v_s = streamVarFactory.testVar("v_s", HDIV);
  BFPtr streamBF = Teuchos::rcp( new BF(streamVarFactory) );
  streamBF->addTerm(psi_1, q_s->dx());
  streamBF->addTerm(psi_2, q_s->dy());
  streamBF->addTerm(-psin_hat, q_s);

  streamBF->addTerm(psi_1, v_s->x());
  streamBF->addTerm(psi_2, v_s->y());
  streamBF->addTerm(phi, v_s->div());
  streamBF->addTerm(-phi_hat, v_s->dot_normal());

  Teuchos::RCP<Mesh> streamMesh, overkillMesh;

  streamMesh = MeshFactory::buildQuadMesh(quadPoints, horizontalCells, verticalCells,
                                          streamBF, H1Order+pToAddForStreamFunction,
                                          H1Order+pToAdd+pToAddForStreamFunction, useTriangles);

  mesh->registerObserver(streamMesh); // will refine streamMesh in the same way as mesh.

  map<int, double> dofsToL2error; // key: numGlobalDofs, value: total L2error compared with overkill
  vector< VarPtr > fields;
  fields.push_back(u1);
  fields.push_back(u2);
  fields.push_back(sigma11);
  fields.push_back(sigma12);
  fields.push_back(sigma21);
  fields.push_back(sigma22);
  fields.push_back(p);

  if (rank == 0)
  {
    cout << "Starting mesh has " << horizontalCells << " x " << verticalCells << " elements and ";
    cout << mesh->numGlobalDofs() << " total dofs.\n";
    cout << "polyOrder = " << polyOrder << endl;
    cout << "pToAdd = " << pToAdd << endl;
    cout << "eps for top BC = " << eps << endl;

    if (useTriangles)
    {
      cout << "Using triangles.\n";
    }
    if (enforceLocalConservation)
    {
      cout << "Enforcing local conservation.\n";
    }
    else
    {
      cout << "NOT enforcing local conservation.\n";
    }
    if (enforceOneIrregularity)
    {
      cout << "Enforcing 1-irregularity.\n";
    }
    else
    {
      cout << "NOT enforcing 1-irregularity.\n";
    }
  }

  ////////////////////   CREATE BCs   ///////////////////////
  SpatialFilterPtr entireBoundary = Teuchos::rcp( new SpatialFilterUnfiltered );

  FunctionPtr u1_prev = Function::solution(u1,solution);
  FunctionPtr u2_prev = Function::solution(u2,solution);

  FunctionPtr u1hat_prev = Function::solution(u1hat,solution);
  FunctionPtr u2hat_prev = Function::solution(u2hat,solution);


  ////////////////////   SOLVE & REFINE   ///////////////////////

  FunctionPtr vorticity = Teuchos::rcp( new PreviousSolutionFunction(solution, - u1->dy() + u2->dx() ) );
  //  FunctionPtr vorticity = Teuchos::rcp( new PreviousSolutionFunction(solution,sigma12 - sigma21) );
  RHSPtr streamRHS = RHS::rhs();
  streamRHS->addTerm(vorticity * q_s);
  ((PreviousSolutionFunction*) vorticity.get())->setOverrideMeshCheck(true);
  ((PreviousSolutionFunction*) u1_prev.get())->setOverrideMeshCheck(true);
  ((PreviousSolutionFunction*) u2_prev.get())->setOverrideMeshCheck(true);

  BCPtr streamBC = BC::bc();
  //  streamBC->addDirichlet(psin_hat, entireBoundary, u0_cross_n);
  streamBC->addDirichlet(phi_hat, entireBoundary, zero);
  //  streamBC->addZeroMeanConstraint(phi);

  IPPtr streamIP = Teuchos::rcp( new IP );
  streamIP->addTerm(q_s);
  streamIP->addTerm(q_s->grad());
  streamIP->addTerm(v_s);
  streamIP->addTerm(v_s->div());
  SolutionPtr streamSolution = Teuchos::rcp( new Solution( streamMesh, streamBC, streamRHS, streamIP ) );

  if (enforceLocalConservation)
  {
    FunctionPtr zero = Function::zero();
    solution->lagrangeConstraints()->addConstraint(u1hat->times_normal_x() + u2hat->times_normal_y()==zero);
    solnIncrement->lagrangeConstraints()->addConstraint(u1hat->times_normal_x() + u2hat->times_normal_y()==zero);
  }

  if (true)
  {
    FunctionPtr u1_incr = Function::solution(u1, solnIncrement);
    FunctionPtr u2_incr = Function::solution(u2, solnIncrement);
    FunctionPtr sigma11_incr = Function::solution(sigma11, solnIncrement);
    FunctionPtr sigma12_incr = Function::solution(sigma12, solnIncrement);
    FunctionPtr sigma21_incr = Function::solution(sigma21, solnIncrement);
    FunctionPtr sigma22_incr = Function::solution(sigma22, solnIncrement);
    FunctionPtr p_incr = Function::solution(p, solnIncrement);

    FunctionPtr l2_incr = u1_incr * u1_incr + u2_incr * u2_incr + p_incr * p_incr
                          + sigma11_incr * sigma11_incr + sigma12_incr * sigma12_incr
                          + sigma21_incr * sigma21_incr + sigma22_incr * sigma22_incr;

    double energyThreshold = 0.20;
    Teuchos::RCP< RefinementStrategy > refinementStrategy = Teuchos::rcp( new RefinementStrategy( solnIncrement, energyThreshold ));

    for (int i=0; i<ReValues.size(); i++)
    {
      double Re = ReValues[i];
      Re_param->setValue(Re);
      if (rank==0) cout << "Solving with Re = " << Re << ":\n";
      double energyErrorTotal;
      do
      {
        double incr_norm;
        do
        {
          problem.iterate(useLineSearch);
          incr_norm = sqrt(l2_incr->integrate(problem.mesh()));
          if (rank==0)
          {
            cout << "\x1B[2K"; // Erase the entire current line.
            cout << "\x1B[0E"; // Move to the beginning of the current line.
            cout << "Iteration: " << problem.iterationCount() << "; L^2(incr) = " << incr_norm;
            flush(cout);
          }
        }
        while ((incr_norm > minL2Increment ) && (problem.iterationCount() < maxIters));
        if (rank==0) cout << endl;
        problem.setIterationCount(1); // 1 means reuse background flow (which we must, given that we want continuation in Re...)
        energyErrorTotal = solnIncrement->energyErrorTotal(); //solution->energyErrorTotal();
        if (energyErrorTotal > energyErrorGoal)
        {
          refinementStrategy->refine(false);
        }
        if (rank==0)
        {
          cout << "Energy error: " << energyErrorTotal << endl;
        }
      }
      while (energyErrorTotal > energyErrorGoal);
    }
  }

  double energyErrorTotal = solution->energyErrorTotal();
  double incrementalEnergyErrorTotal = solnIncrement->energyErrorTotal();
  if (rank == 0)
  {
    cout << "final mesh has " << mesh->numActiveElements() << " elements and " << mesh->numGlobalDofs() << " dofs.\n";
    cout << "energy error: " << energyErrorTotal << endl;
    cout << "  (Incremental solution's energy error is " << incrementalEnergyErrorTotal << ".)\n";
  }

  FunctionPtr u1_sq = u1_prev * u1_prev;
  FunctionPtr u_dot_u = u1_sq + (u2_prev * u2_prev);
  FunctionPtr u_mag = Teuchos::rcp( new SqrtFunction( u_dot_u ) );
  FunctionPtr u_div = Teuchos::rcp( new PreviousSolutionFunction(solution, u1->dx() + u2->dy() ) );
  FunctionPtr massFlux = Teuchos::rcp( new PreviousSolutionFunction(solution, u1hat->times_normal_x() + u2hat->times_normal_y()) );

  // check that the zero mean pressure is being correctly imposed:
  FunctionPtr p_prev = Teuchos::rcp( new PreviousSolutionFunction(solution,p) );
  double p_avg = p_prev->integrate(mesh);
  if (rank==0)
    cout << "Integral of pressure: " << p_avg << endl;

  // integrate massFlux over each element (a test):
  // fake a new bilinear form so we can integrate against 1
  VarPtr testOne = varFactory.testVar("1",CONSTANT_SCALAR);
  BFPtr fakeBF = Teuchos::rcp( new BF(varFactory) );
  LinearTermPtr massFluxTerm = massFlux * testOne;

  CellTopoPtrLegacy quadTopoPtr = Teuchos::rcp(new shards::CellTopology(shards::getCellTopologyData<shards::Quadrilateral<4> >() ));
  DofOrderingFactory dofOrderingFactory(fakeBF);
  int fakeTestOrder = H1Order;
  DofOrderingPtr testOrdering = dofOrderingFactory.testOrdering(fakeTestOrder, *quadTopoPtr);

  int testOneIndex = testOrdering->getDofIndex(testOne->ID(),0);
  vector< ElementTypePtr > elemTypes = mesh->elementTypes(); // global element types
  map<int, double> massFluxIntegral; // cellID -> integral
  double maxMassFluxIntegral = 0.0;
  double totalMassFlux = 0.0;
  double totalAbsMassFlux = 0.0;
  double maxCellMeasure = 0;
  double minCellMeasure = 1;
  for (vector< ElementTypePtr >::iterator elemTypeIt = elemTypes.begin(); elemTypeIt != elemTypes.end(); elemTypeIt++)
  {
    ElementTypePtr elemType = *elemTypeIt;
    vector< ElementPtr > elems = mesh->elementsOfTypeGlobal(elemType);
    vector<GlobalIndexType> cellIDs;
    for (int i=0; i<elems.size(); i++)
    {
      cellIDs.push_back(elems[i]->cellID());
    }
    FieldContainer<double> physicalCellNodes = mesh->physicalCellNodesGlobal(elemType);
    BasisCachePtr basisCache = Teuchos::rcp( new BasisCache(elemType,mesh,polyOrder) ); // enrich by trial space order
    basisCache->setPhysicalCellNodes(physicalCellNodes,cellIDs,true); // true: create side caches
    FieldContainer<double> cellMeasures = basisCache->getCellMeasures();
    FieldContainer<double> fakeRHSIntegrals(elems.size(),testOrdering->totalDofs());
    massFluxTerm->integrate(fakeRHSIntegrals,testOrdering,basisCache,true); // true: force side evaluation
    //      cout << "fakeRHSIntegrals:\n" << fakeRHSIntegrals;
    for (int i=0; i<elems.size(); i++)
    {
      int cellID = cellIDs[i];
      // pick out the ones for testOne:
      massFluxIntegral[cellID] = fakeRHSIntegrals(i,testOneIndex);
    }
    // find the largest:
    for (int i=0; i<elems.size(); i++)
    {
      int cellID = cellIDs[i];
      maxMassFluxIntegral = max(abs(massFluxIntegral[cellID]), maxMassFluxIntegral);
    }
    for (int i=0; i<elems.size(); i++)
    {
      int cellID = cellIDs[i];
      maxCellMeasure = max(maxCellMeasure,cellMeasures(i));
      minCellMeasure = min(minCellMeasure,cellMeasures(i));
      maxMassFluxIntegral = max(abs(massFluxIntegral[cellID]), maxMassFluxIntegral);
      totalMassFlux += massFluxIntegral[cellID];
      totalAbsMassFlux += abs( massFluxIntegral[cellID] );
    }
  }
  if (rank==0)
  {
    cout << "largest mass flux: " << maxMassFluxIntegral << endl;
    cout << "total mass flux: " << totalMassFlux << endl;
    cout << "sum of mass flux absolute value: " << totalAbsMassFlux << endl;
    cout << "largest h: " << sqrt(maxCellMeasure) << endl;
    cout << "smallest h: " << sqrt(minCellMeasure) << endl;
    cout << "ratio of largest / smallest h: " << sqrt(maxCellMeasure) / sqrt(minCellMeasure) << endl;
  }
  if (rank == 0)
  {
    cout << "phi ID: " << phi->ID() << endl;
    cout << "psi1 ID: " << psi_1->ID() << endl;
    cout << "psi2 ID: " << psi_2->ID() << endl;

    cout << "streamMesh has " << streamMesh->numActiveElements() << " elements.\n";
    cout << "solving for approximate stream function...\n";
  }

  streamSolution->solve(useMumps);
  energyErrorTotal = streamSolution->energyErrorTotal();
  if (rank == 0)
  {
    cout << "...solved.\n";
    cout << "Stream mesh has energy error: " << energyErrorTotal << endl;
  }

  if (rank==0)
  {
    solution->writeToVTK("nsCavitySoln.vtk");
    if (! meshHasTriangles )
    {
      massFlux->writeBoundaryValuesToMATLABFile(solution->mesh(), "massFlux.dat");
      u_mag->writeValuesToMATLABFile(solution->mesh(), "u_mag.m");
      u_div->writeValuesToMATLABFile(solution->mesh(), "u_div.m");
      solution->writeFieldsToFile(u1->ID(), "u1.m");
      solution->writeFluxesToFile(u1hat->ID(), "u1_hat.dat");
      solution->writeFieldsToFile(u2->ID(), "u2.m");
      solution->writeFluxesToFile(u2hat->ID(), "u2_hat.dat");
      solution->writeFieldsToFile(p->ID(), "p.m");
      streamSolution->writeFieldsToFile(phi->ID(), "phi.m");

      streamSolution->writeFluxesToFile(phi_hat->ID(), "phi_hat.dat");
      streamSolution->writeFieldsToFile(psi_1->ID(), "psi1.m");
      streamSolution->writeFieldsToFile(psi_2->ID(), "psi2.m");
      vorticity->writeValuesToMATLABFile(streamMesh, "vorticity.m");

      FunctionPtr ten = Teuchos::rcp( new ConstantScalarFunction(10) );
      ten->writeBoundaryValuesToMATLABFile(solution->mesh(), "skeleton.dat");
      cout << "wrote files: u_mag.m, u_div.m, u1.m, u1_hat.dat, u2.m, u2_hat.dat, p.m, phi.m, vorticity.m.\n";
    }
    else
    {
      solution->writeToFile(u1->ID(), "u1.dat");
      solution->writeToFile(u2->ID(), "u2.dat");
      solution->writeToFile(u2->ID(), "p.dat");
      cout << "wrote files: u1.dat, u2.dat, p.dat\n";
    }

    FieldContainer<double> points = pointGrid(0, 1, 0, 1, 100);
    FieldContainer<double> pointData = solutionData(points, streamSolution, phi);
    GnuPlotUtil::writeXYPoints("phi_patch_navierStokes_cavity.dat", pointData);
    set<double> patchContourLevels = diagonalContourLevels(pointData,1);
    vector<string> patchDataPath;
    patchDataPath.push_back("phi_patch_navierStokes_cavity.dat");
    GnuPlotUtil::writeContourPlotScript(patchContourLevels, patchDataPath, "lidCavityNavierStokes.p");

    GnuPlotUtil::writeExactMeshSkeleton("lid_navierStokes_continuation_adaptive", mesh, 2);

    writePatchValues(0, 1, 0, 1, streamSolution, phi, "phi_patch.m");
    writePatchValues(0, .1, 0, .1, streamSolution, phi, "phi_patch_detail.m");
    writePatchValues(0, .01, 0, .01, streamSolution, phi, "phi_patch_minute_detail.m");
    writePatchValues(0, .001, 0, .001, streamSolution, phi, "phi_patch_minute_minute_detail.m");
  }

  return 0;
}
int main(int argc, char *argv[])
{

#ifdef HAVE_MPI
  Teuchos::GlobalMPISession mpiSession(&argc, &argv,0);

  Epetra_MpiComm Comm(MPI_COMM_WORLD);
#else
  Epetra_SerialComm Comm;
#endif

  int commRank = Teuchos::GlobalMPISession::getRank();

  Comm.Barrier(); // set breakpoint here to allow debugger attachment to other MPI processes than the one you automatically attached to.

  Teuchos::CommandLineProcessor cmdp(false,true); // false: don't throw exceptions; true: do return errors for unrecognized options

  // problem parameters:
  double mu = 0.1;
  double permCoef = 1e4;
  int numRefs = 0;
  int k = 2, delta_k = 2;
  string norm = "Graph";
  cmdp.setOption("polyOrder",&k,"polynomial order for field variable u");
  cmdp.setOption("delta_k", &delta_k, "test space polynomial order enrichment");
  cmdp.setOption("numRefs",&numRefs,"number of refinements");
  cmdp.setOption("norm", &norm, "norm");
  cmdp.setOption("mu", &mu, "mu");
  cmdp.setOption("permCoef", &permCoef, "Permeability coefficient");

  if (cmdp.parse(argc,argv) != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL)
  {
#ifdef HAVE_MPI
    MPI_Finalize();
#endif
    return -1;
  }

  FunctionPtr zero = TFunction<double>::zero();
  FunctionPtr one = TFunction<double>::constant(1);
  FunctionPtr sin2pix = Teuchos::rcp( new Sin_ax(2*pi) );
  FunctionPtr cos2pix = Teuchos::rcp( new Cos_ax(2*pi) );
  FunctionPtr sin2piy = Teuchos::rcp( new Sin_ay(2*pi) );
  FunctionPtr cos2piy = Teuchos::rcp( new Cos_ay(2*pi) );
  FunctionPtr u1_exact = sin2pix*cos2piy;
  FunctionPtr u2_exact = -cos2pix*sin2piy;
  FunctionPtr x2 = TFunction<double>::xn(2);
  FunctionPtr y2 = TFunction<double>::yn(2);
  FunctionPtr p_exact = x2*y2 - 1./9;
  FunctionPtr permInv = permCoef*(sin2pix + 1.1);

  VarFactoryPtr vf = VarFactory::varFactory();
  //fields:
  VarPtr sigma1 = vf->fieldVar("sigma1", VECTOR_L2);
  VarPtr sigma2 = vf->fieldVar("sigma2", VECTOR_L2);
  VarPtr u1 = vf->fieldVar("u1", L2);
  VarPtr u2 = vf->fieldVar("u2", L2);
  VarPtr p = vf->fieldVar("p", L2);

  // traces:
  VarPtr u1hat = vf->traceVar("u1hat");
  VarPtr u2hat = vf->traceVar("u2hat");
  VarPtr t1c = vf->fluxVar("t1c");
  VarPtr t2c = vf->fluxVar("t2c");

  // test:
  VarPtr v1 = vf->testVar("v1", HGRAD);
  VarPtr v2 = vf->testVar("v2", HGRAD);
  VarPtr tau1 = vf->testVar("tau1", HDIV);
  VarPtr tau2 = vf->testVar("tau2", HDIV);
  VarPtr q = vf->testVar("q", HGRAD);

  BFPtr bf = Teuchos::rcp( new BF(vf) );

  bf->addTerm(1./mu*sigma1, tau1);
  bf->addTerm(1./mu*sigma2, tau2);
  bf->addTerm(u1, tau1->div());
  bf->addTerm(u2, tau2->div());
  bf->addTerm(-u1hat, tau1->dot_normal());
  bf->addTerm(-u2hat, tau2->dot_normal());

  bf->addTerm(sigma1, v1->grad());
  bf->addTerm(sigma2, v2->grad());
  bf->addTerm(-p, v1->dx());
  bf->addTerm(-p, v2->dy());
  bf->addTerm(t1c, v1);
  bf->addTerm(t2c, v2);
  bf->addTerm(mu*permInv*u1, v1);
  bf->addTerm(mu*permInv*u2, v2);

  bf->addTerm(-u1, q->dx());
  bf->addTerm(-u2, q->dy());
  bf->addTerm(u1hat, q->times_normal_x());
  bf->addTerm(u2hat, q->times_normal_y());

  RHSPtr rhs = RHS::rhs();

  BCPtr bc = BC::bc();

  SpatialFilterPtr y_equals_one = SpatialFilter::matchingY(1.0);
  SpatialFilterPtr y_equals_zero = SpatialFilter::matchingY(0);
  SpatialFilterPtr x_equals_one = SpatialFilter::matchingX(1.0);
  SpatialFilterPtr x_equals_zero = SpatialFilter::matchingX(0.0);
  bc->addDirichlet(u1hat, y_equals_zero, u1_exact);
  bc->addDirichlet(u2hat, y_equals_zero, u2_exact);
  bc->addDirichlet(u1hat, x_equals_zero, u1_exact);
  bc->addDirichlet(u2hat, x_equals_zero, u2_exact);
  bc->addDirichlet(u1hat, y_equals_one, u1_exact);
  bc->addDirichlet(u2hat, y_equals_one, u2_exact);
  bc->addDirichlet(u1hat, x_equals_one, u1_exact);
  bc->addDirichlet(u2hat, x_equals_one, u2_exact);
  bc->addZeroMeanConstraint(p);

  MeshPtr mesh = MeshFactory::quadMesh(bf, k+1, delta_k, 1, 1, 4, 4);

  map<string, IPPtr> brinkmanIPs;
  brinkmanIPs["Graph"] = bf->graphNorm();

  brinkmanIPs["Decoupled"] = Teuchos::rcp(new IP);
  brinkmanIPs["Decoupled"]->addTerm(tau1);
  brinkmanIPs["Decoupled"]->addTerm(tau2);
  brinkmanIPs["Decoupled"]->addTerm(tau1->div());
  brinkmanIPs["Decoupled"]->addTerm(tau2->div());
  brinkmanIPs["Decoupled"]->addTerm(permInv*v1);
  brinkmanIPs["Decoupled"]->addTerm(permInv*v2);
  brinkmanIPs["Decoupled"]->addTerm(v1->grad());
  brinkmanIPs["Decoupled"]->addTerm(v2->grad());
  brinkmanIPs["Decoupled"]->addTerm(q);
  brinkmanIPs["Decoupled"]->addTerm(q->grad());

  // brinkmanIPs["CoupledRobust"] = Teuchos::rcp(new IP);
  // brinkmanIPs["CoupledRobust"]->addTerm(tau->div()-beta*v->grad());
  // brinkmanIPs["CoupledRobust"]->addTerm(Function<double>::min(one/Function<double>::h(),Function<double>::constant(1./sqrt(epsilon)))*tau);
  // brinkmanIPs["CoupledRobust"]->addTerm(sqrt(epsilon)*v->grad());
  // brinkmanIPs["CoupledRobust"]->addTerm(beta*v->grad());
  // brinkmanIPs["CoupledRobust"]->addTerm(Function<double>::min(sqrt(epsilon)*one/Function<double>::h(),one)*v);

  IPPtr ip = brinkmanIPs[norm];

  SolutionPtr soln = TSolution<double>::solution(mesh, bc, rhs, ip);

  double threshold = 0.20;
  RefinementStrategy refStrategy(soln, threshold);

  ostringstream refName;
  refName << "brinkman";
  HDF5Exporter exporter(mesh,refName.str());

  for (int refIndex=0; refIndex <= numRefs; refIndex++)
  {
    soln->solve(false);

    double energyError = soln->energyErrorTotal();
    if (commRank == 0)
    {
      // if (refIndex > 0)
      // refStrategy.printRefinementStatistics(refIndex-1);
      cout << "Refinement:\t " << refIndex << " \tElements:\t " << mesh->numActiveElements()
           << " \tDOFs:\t " << mesh->numGlobalDofs() << " \tEnergy Error:\t " << energyError << endl;
    }

    exporter.exportSolution(soln, refIndex);

    if (refIndex != numRefs)
      refStrategy.refine();
  }

  return 0;
}
int main(int argc, char *argv[])
{
#ifdef ENABLE_INTEL_FLOATING_POINT_EXCEPTIONS
  cout << "NOTE: enabling floating point exceptions for divide by zero.\n";
  _MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~_MM_MASK_INVALID);
#endif

  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
  int rank = Teuchos::GlobalMPISession::getRank();

  Teuchos::CommandLineProcessor cmdp(false,true); // false: don't throw exceptions; true: do return errors for unrecognized options

  const static double PI  = 3.141592653589793238462;

  bool useCondensedSolve = true; // condensed solve not yet compatible with minimum rule meshes

  int k = 2; // poly order for u in every direction, including temporal
  int numCells = 32; // in x, y
  int numTimeCells = 1;
  int numTimeSlabs = -1;
  int numFrames = 201;
  int delta_k = 3;   // test space enrichment: should be 3 for 3D
  int maxRefinements = 0; // maximum # of refinements on each time slab
  bool useMumpsIfAvailable  = true;
  bool useConstantConvection = false;
  double refinementTolerance = 0.1;

  int checkPointFrequency = 50; // output solution and mesh every 50 time slabs

  int previousSolutionTimeSlabNumber = -1;
  string previousSolutionFile = "";
  string previousMeshFile = "";

  cmdp.setOption("polyOrder",&k,"polynomial order for field variable u");
  cmdp.setOption("delta_k", &delta_k, "test space polynomial order enrichment");

  cmdp.setOption("numCells",&numCells,"number of cells in x and y directions");
  cmdp.setOption("numTimeCells",&numTimeCells,"number of time axis cells");
  cmdp.setOption("numTimeSlabs",&numTimeSlabs,"number of time slabs");
  cmdp.setOption("numFrames",&numFrames,"number of frames for export");

  cmdp.setOption("useConstantConvection", "useVariableConvection", &useConstantConvection);

  cmdp.setOption("useCondensedSolve", "useUncondensedSolve", &useCondensedSolve, "use static condensation to reduce the size of the global solve");
  cmdp.setOption("useMumps", "useKLU", &useMumpsIfAvailable, "use MUMPS (if available)");

  cmdp.setOption("refinementTolerance", &refinementTolerance, "relative error beyond which to stop refining");
  cmdp.setOption("maxRefinements", &maxRefinements, "maximum # of refinements on each time slab");

  cmdp.setOption("previousSlabNumber", &previousSolutionTimeSlabNumber, "time slab number of previous solution");
  cmdp.setOption("previousSolution", &previousSolutionFile, "file with previous solution");
  cmdp.setOption("previousMesh", &previousMeshFile, "file with previous mesh");

  if (cmdp.parse(argc,argv) != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL)
  {
#ifdef HAVE_MPI
    MPI_Finalize();
#endif
    return -1;
  }

  int H1Order = k + 1;

  VarFactory varFactory;
  // traces:
  VarPtr qHat = varFactory.fluxVar("\\widehat{q}");

  // fields:
  VarPtr u = varFactory.fieldVar("u", L2);

  // test functions:
  VarPtr v = varFactory.testVar("v", HGRAD);

  FunctionPtr x = Function::xn(1);
  FunctionPtr y = Function::yn(1);

  FunctionPtr c;
  if (useConstantConvection)
  {
    c = Function::vectorize(Function::constant(0.5), Function::constant(0.5), Function::constant(1.0));
  }
  else
  {
    c = Function::vectorize(y-0.5, 0.5-x, Function::constant(1.0));
  }
  FunctionPtr n = Function::normal();

  BFPtr bf = Teuchos::rcp( new BF(varFactory) );

  bf->addTerm( u, c * v->grad());
  bf->addTerm(qHat, v);

  double width = 2.0, height = 2.0;
  int horizontalCells = numCells, verticalCells = numCells;
  int depthCells = numTimeCells;
  double x0 = -0.5;
  double y0 = -0.5;
  double t0 = 0;

  double totalTime = 2.0 * PI;

  vector<double> frameTimes;
  for (int i=0; i<numFrames; i++)
  {
    frameTimes.push_back((totalTime*i) / (numFrames-1));
  }

  if (numTimeSlabs==-1)
  {
    // want the number of grid points in temporal direction to be about 2000.  The temporal length is 2 * PI
    numTimeSlabs = (int) 2000 / k;
  }
  double timeLengthPerSlab = totalTime / numTimeSlabs;

  if (rank==0)
  {
    cout << "solving on " << numCells << " x " << numCells << " x " << numTimeCells << " mesh " << "of order " << k << ".\n";
    cout << "numTimeSlabs: " << numTimeSlabs << endl;
  }

  SpatialFilterPtr inflowFilter  = Teuchos::rcp( new InflowFilterForClockwisePlanarRotation (x0,x0+width,y0,y0+height,0.5,0.5));

  vector<double> dimensions;
  dimensions.push_back(width);
  dimensions.push_back(height);
  dimensions.push_back(timeLengthPerSlab);

  vector<int> elementCounts(3);
  elementCounts[0] = horizontalCells;
  elementCounts[1] = verticalCells;
  elementCounts[2] = depthCells;

  vector<double> origin(3);
  origin[0] = x0;
  origin[1] = y0;
  origin[2] = t0;

  Teuchos::RCP<Solver> solver = Teuchos::rcp( new KluSolver );

#ifdef HAVE_AMESOS_MUMPS
  if (useMumpsIfAvailable) solver = Teuchos::rcp( new MumpsSolver );
#endif

//  double errorPercentage = 0.5; // for mesh refinements: ask to refine elements that account for 80% of the error in each step
//  Teuchos::RCP<RefinementStrategy> refinementStrategy;
//  refinementStrategy = Teuchos::rcp( new ErrorPercentageRefinementStrategy( soln, errorPercentage ));

  if (maxRefinements != 0)
  {
    cout << "Warning: maxRefinements is not 0, but the slice exporter implicitly assumes there won't be any refinements.\n";
  }

  MeshPtr mesh;

  MeshPtr prevMesh;
  SolutionPtr prevSoln;

  mesh = MeshFactory::rectilinearMesh(bf, dimensions, elementCounts, H1Order, delta_k, origin);

  if (rank==0) cout << "Initial mesh has " << mesh->getTopology()->activeCellCount() << " active (leaf) cells " << "and " << mesh->globalDofCount() << " degrees of freedom.\n";

  FunctionPtr sideParity = Function::sideParity();

  int lastFrameOutputted = -1;

  SolutionPtr soln;

  IPPtr ip;
  ip = bf->graphNorm();

  FunctionPtr u0 = Teuchos::rcp( new Cone_U0(0.0, 0.25, 0.1, 1.0, false) );

  BCPtr bc = BC::bc();
  bc->addDirichlet(qHat, inflowFilter, Function::zero()); // zero BCs enforced at the inflow boundary.
  bc->addDirichlet(qHat, SpatialFilter::matchingZ(t0), u0);

  MeshPtr initialMesh = mesh;

  int startingSlabNumber;
  if (previousSolutionTimeSlabNumber != -1)
  {
    startingSlabNumber = previousSolutionTimeSlabNumber + 1;

    if (rank==0) cout << "Loading mesh from " << previousMeshFile << endl;

    prevMesh = MeshFactory::loadFromHDF5(bf, previousMeshFile);
    prevSoln = Solution::solution(mesh, bc, RHS::rhs(), ip); // include BC and IP objects for sake of condensed dof interpreter setup...
    prevSoln->setUseCondensedSolve(useCondensedSolve);

    if (rank==0) cout << "Loading solution from " << previousSolutionFile << endl;
    prevSoln->loadFromHDF5(previousSolutionFile);

    double tn = (previousSolutionTimeSlabNumber+1) * timeLengthPerSlab;
    origin[2] = tn;
    mesh = MeshFactory::rectilinearMesh(bf, dimensions, elementCounts, H1Order, delta_k, origin);

    FunctionPtr q_prev = Function::solution(qHat, prevSoln);
    FunctionPtr q_transfer = Teuchos::rcp( new MeshTransferFunction(-q_prev, prevMesh, mesh, tn) ); // negate because the normals go in opposite directions

    bc = BC::bc();
    bc->addDirichlet(qHat, inflowFilter, Function::zero()); // zero BCs enforced at the inflow boundary.
    bc->addDirichlet(qHat, SpatialFilter::matchingZ(tn), q_transfer);

    double t_slab_final = (previousSolutionTimeSlabNumber+1) * timeLengthPerSlab;
    int frameOrdinal = 0;

    while (frameTimes[frameOrdinal] < t_slab_final)
    {
      lastFrameOutputted = frameOrdinal++;
    }
  }
  else
  {
    startingSlabNumber = 0;
  }


#ifdef HAVE_EPETRAEXT_HDF5
  ostringstream dir_name;
  dir_name << "spacetime_slice_convectingCone_k" << k << "_startSlab" << startingSlabNumber;
  map<GlobalIndexType,GlobalIndexType> cellMap;
  MeshPtr meshSlice = MeshTools::timeSliceMesh(initialMesh, 0, cellMap, H1Order);
  HDF5Exporter sliceExporter(meshSlice,dir_name.str());
#endif

  soln = Solution::solution(mesh, bc, RHS::rhs(), ip);
  soln->setUseCondensedSolve(useCondensedSolve);

  for(int timeSlab = startingSlabNumber; timeSlab<numTimeSlabs; timeSlab++)
  {
    double energyThreshold = 0.2; // for mesh refinements: ask to refine elements that account for 80% of the error in each step
    Teuchos::RCP<RefinementStrategy> refinementStrategy;
    refinementStrategy = Teuchos::rcp( new RefinementStrategy( soln, energyThreshold ));

    FunctionPtr u_spacetime = Function::solution(u, soln);

    double relativeEnergyError;
    int refNumber = 0;

//    {
//      // DEBUGGING: just to try running the time slicing:
//      double t_slab_final = (timeStep+1) * timeLengthPerSlab;
//      int frameOrdinal = lastFrameOutputted + 1;
//      while (frameTimes[frameOrdinal] < t_slab_final) {
//        FunctionPtr u_spacetime = Function::solution(u, soln);
//        ostringstream dir_name;
//        dir_name << "spacetime_slice_convectingCone_k" << k;
//        MeshTools::timeSliceExport(dir_name.str(), mesh, u_spacetime, frameTimes[frameOrdinal], "u_slice");
//
//        cout << "Exported frame " << frameOrdinal << ", t=" << frameTimes[frameOrdinal] << endl;
//        frameOrdinal++;
//      }
//    }

    do
    {
      soln->solve(solver);
      soln->reportTimings();

#ifdef HAVE_EPETRAEXT_HDF5
      ostringstream dir_name;
      dir_name << "spacetime_convectingCone_k" << k << "_t" << timeSlab;
      HDF5Exporter exporter(soln->mesh(),dir_name.str());
      exporter.exportSolution(soln, varFactory);

      if (rank==0) cout << "Exported HDF solution for time slab to directory " << dir_name.str() << endl;
//      string u_name = "u_spacetime";
//      exporter.exportFunction(u_spacetime, u_name);

      ostringstream file_name;
      file_name << dir_name.str();

      bool saveSolutionAndMeshForThisSlab = ((timeSlab + 1) % checkPointFrequency == 0); // +1 so that first output is nth, not first
      if (saveSolutionAndMeshForThisSlab)
      {
        dir_name << ".soln";
        soln->saveToHDF5(dir_name.str());
        if (rank==0) cout << endl << "wrote " << dir_name.str() << endl;

        file_name << ".mesh";
        soln->mesh()->saveToHDF5(file_name.str());
      }
#endif
      FunctionPtr u_soln = Function::solution(u, soln);

      double solnNorm = u_soln->l2norm(mesh);

      double energyError = soln->energyErrorTotal();
      relativeEnergyError = energyError / solnNorm;

      if (rank==0)
      {
        cout << "Relative energy error for refinement " << refNumber++ << ": " << relativeEnergyError << endl;
      }

      if ((relativeEnergyError > refinementTolerance) && (refNumber < maxRefinements))
      {
        refinementStrategy->refine();
        if (rank==0)
        {
          cout << "After refinement, mesh has " << mesh->getTopology()->activeCellCount() << " active (leaf) cells " << "and " << mesh->globalDofCount() << " degrees of freedom.\n";
        }
      }

    }
    while ((relativeEnergyError > refinementTolerance) && (refNumber < maxRefinements));

    double t_slab_final = (timeSlab+1) * timeLengthPerSlab;
    int frameOrdinal = lastFrameOutputted + 1;
    vector<double> timesForSlab;
    while (frameTimes[frameOrdinal] < t_slab_final)
    {
      double t = frameTimes[frameOrdinal];
      if (rank==0) cout << "exporting t=" << t << " on slab " << timeSlab << endl;
      FunctionPtr sliceFunction = MeshTools::timeSliceFunction(mesh, cellMap, u_spacetime, t);
      sliceExporter.exportFunction(sliceFunction, "u_slice", t);
      lastFrameOutputted = frameOrdinal++;
    }

    // set up next mesh/solution:
    FunctionPtr q_prev = Function::solution(qHat, soln);

//    cout << "Error in setup of q_prev: simple solution doesn't know about the map from the previous time slab to the current one. (TODO: fix this.)\n";

    double tn = (timeSlab+1) * timeLengthPerSlab;
    origin[2] = tn;
    mesh = MeshFactory::rectilinearMesh(bf, dimensions, elementCounts, H1Order, delta_k, origin);

    FunctionPtr q_transfer = Teuchos::rcp( new MeshTransferFunction(-q_prev, soln->mesh(), mesh, tn) ); // negate because the normals go in opposite directions

    bc = BC::bc();
    bc->addDirichlet(qHat, inflowFilter, Function::zero()); // zero BCs enforced at the inflow boundary.
    bc->addDirichlet(qHat, SpatialFilter::matchingZ(tn), q_transfer);

    // IMPORTANT: now that we are ready to step to next soln, nullify BC.  If we do not do this, then we have an RCP chain
    //            that extends back to the first time slab, effectively a memory leak.
    soln->setBC(BC::bc());

    soln = Solution::solution(mesh, bc, RHS::rhs(), ip);
    soln->setUseCondensedSolve(useCondensedSolve);
  }

  return 0;
}
int main(int argc, char *argv[])
{

#ifdef HAVE_MPI
  Teuchos::GlobalMPISession mpiSession(&argc, &argv,0);

  Epetra_MpiComm Comm(MPI_COMM_WORLD);
#else
  Epetra_SerialComm Comm;
#endif

  int commRank = Teuchos::GlobalMPISession::getRank();

  Comm.Barrier(); // set breakpoint here to allow debugger attachment to other MPI processes than the one you automatically attached to.

  Teuchos::CommandLineProcessor cmdp(false,true); // false: don't throw exceptions; true: do return errors for unrecognized options

  // problem parameters:
  int spaceDim = 2;
  double epsilon = 1e-2;
  int numRefs = 0;
  int k = 2, delta_k = 2;
  int numXElems = 1;
  bool useConformingTraces = true;
  string solverChoice = "KLU";
  string coarseSolverChoice = "KLU"; // often this beats SuperLU_Dist as coarse solver (true on BG/Q with 6000 3D elements on 256 ranks)
  double solverTolerance = 1e-6;
  string norm = "CoupledRobust";
  cmdp.setOption("spaceDim", &spaceDim, "spatial dimension");
  cmdp.setOption("polyOrder",&k,"polynomial order for field variable u");
  cmdp.setOption("delta_k", &delta_k, "test space polynomial order enrichment");
  cmdp.setOption("numRefs",&numRefs,"number of refinements");
  cmdp.setOption("numXElems",&numXElems,"number of elements in x direction");
  cmdp.setOption("epsilon", &epsilon, "epsilon");
  cmdp.setOption("norm", &norm, "norm");
  cmdp.setOption("conformingTraces", "nonconformingTraces", &useConformingTraces, "use conforming traces");
  cmdp.setOption("coarseSolver", &coarseSolverChoice, "KLU, SuperLU");
  cmdp.setOption("solver", &solverChoice, "KLU, SuperLU, MUMPS, GMG-Direct, GMG-ILU, GMG-IC");
  cmdp.setOption("solverTolerance", &solverTolerance, "iterative solver tolerance");

  if (cmdp.parse(argc,argv) != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL)
  {
#ifdef HAVE_MPI
    MPI_Finalize();
#endif
    return -1;
  }

  FunctionPtr beta;
  FunctionPtr beta_x = Function::constant(1);
  FunctionPtr beta_y = Function::constant(2);
  FunctionPtr beta_z = Function::constant(3);
  if (spaceDim == 1)
    beta = beta_x;
  else if (spaceDim == 2)
    beta = Function::vectorize(beta_x, beta_y);
  else if (spaceDim == 3)
    beta = Function::vectorize(beta_x, beta_y, beta_z);

  ConvectionDiffusionFormulation form(spaceDim, useConformingTraces, beta, epsilon);

  // Define right hand side
  RHSPtr rhs = RHS::rhs();

  // Set up boundary conditions
  BCPtr bc = BC::bc();
  VarPtr uhat = form.uhat();
  VarPtr tc = form.tc();
  SpatialFilterPtr inflowX = SpatialFilter::matchingX(-1);
  SpatialFilterPtr inflowY = SpatialFilter::matchingY(-1);
  SpatialFilterPtr inflowZ = SpatialFilter::matchingZ(-1);
  SpatialFilterPtr outflowX = SpatialFilter::matchingX(1);
  SpatialFilterPtr outflowY = SpatialFilter::matchingY(1);
  SpatialFilterPtr outflowZ = SpatialFilter::matchingZ(1);
  FunctionPtr zero = Function::zero();
  FunctionPtr one = Function::constant(1);
  FunctionPtr x = Function::xn(1);
  FunctionPtr y = Function::yn(1);
  FunctionPtr z = Function::zn(1);
  if (spaceDim == 1)
  {
    bc->addDirichlet(tc, inflowX, -one);
    bc->addDirichlet(uhat, outflowX, zero);
  }
  if (spaceDim == 2)
  {
    bc->addDirichlet(tc, inflowX, -1*.5*(one-y));
    bc->addDirichlet(uhat, outflowX, zero);
    bc->addDirichlet(tc, inflowY, -2*.5*(one-x));
    bc->addDirichlet(uhat, outflowY, zero);
  }
  if (spaceDim == 3)
  {
    bc->addDirichlet(tc, inflowX, -1*.25*(one-y)*(one-z));
    bc->addDirichlet(uhat, outflowX, zero);
    bc->addDirichlet(tc, inflowY, -2*.25*(one-x)*(one-z));
    bc->addDirichlet(uhat, outflowY, zero);
    bc->addDirichlet(tc, inflowZ, -3*.25*(one-x)*(one-y));
    bc->addDirichlet(uhat, outflowZ, zero);
  }

  // Build mesh
  vector<double> x0 = vector<double>(spaceDim,-1.0);
  double width = 2.0;
  vector<double> dimensions;
  vector<int> elementCounts;
  for (int d=0; d<spaceDim; d++)
  {
    dimensions.push_back(width);
    elementCounts.push_back(numXElems);
  }
  MeshPtr mesh = MeshFactory::rectilinearMesh(form.bf(), dimensions, elementCounts, k+1, delta_k, x0);
  MeshPtr k0Mesh = Teuchos::rcp( new Mesh (mesh->getTopology()->deepCopy(), form.bf(), 1, delta_k) );
  mesh->registerObserver(k0Mesh);

  // Set up solution
  SolutionPtr soln = Solution::solution(form.bf(), mesh, bc, rhs, form.ip(norm));

  double threshold = 0.20;
  RefinementStrategy refStrategy(soln, threshold);

  ostringstream refName;
  refName << "confusion" << spaceDim << "D_" << norm << "_" << epsilon << "_k" << k << "_" << solverChoice;
  // HDF5Exporter exporter(mesh,refName.str());

  Teuchos::RCP<Time> solverTime = Teuchos::TimeMonitor::getNewCounter("Solve Time");

  if (commRank == 0)
    Solver::printAvailableSolversReport();
  map<string, SolverPtr> solvers;
  solvers["KLU"] = Solver::getSolver(Solver::KLU, true);
  SolverPtr superluSolver = Solver::getSolver(Solver::SuperLUDist, true);
  solvers["SuperLU"] = superluSolver;
  
  int maxIters = 2000;
  bool useStaticCondensation = false;
  int azOutput = 20; // print residual every 20 CG iterations

  ofstream dataFile(refName.str()+".txt");
  dataFile << "ref\t " << "elements\t " << "dofs\t " << "error\t " << "solvetime\t" << "iterations\t " << endl;
  for (int refIndex=0; refIndex <= numRefs; refIndex++)
  {
    solverTime->start(true);
    Teuchos::RCP<GMGSolver> gmgSolver;
    if (solverChoice[0] == 'G')
    {
      gmgSolver = Teuchos::rcp( new GMGSolver(soln, k0Mesh, maxIters, solverTolerance, solvers[coarseSolverChoice], useStaticCondensation));
      
      gmgSolver->setAztecOutput(azOutput);
      if (solverChoice == "GMG-Direct")
        gmgSolver->gmgOperator().setSchwarzFactorizationType(GMGOperator::Direct);
      if (solverChoice == "GMG-ILU")
        gmgSolver->gmgOperator().setSchwarzFactorizationType(GMGOperator::ILU);
      if (solverChoice == "GMG-IC")
        gmgSolver->gmgOperator().setSchwarzFactorizationType(GMGOperator::IC);
      soln->solve(gmgSolver);
    }
    else
      soln->condensedSolve(solvers[solverChoice]);
    double solveTime = solverTime->stop();

    double energyError = soln->energyErrorTotal();
    if (commRank == 0)
    {
      // if (refIndex > 0)
      // refStrategy.printRefinementStatistics(refIndex-1);
      if (solverChoice[0] == 'G')
      {
        cout << "Refinement: " << refIndex
             << " \tElements: " << mesh->numActiveElements()
             << " \tDOFs: " << mesh->numGlobalDofs()
             << " \tEnergy Error: " << energyError
             << " \tSolve Time: " << solveTime
             << " \tIteration Count: " << gmgSolver->iterationCount()
             << endl;
        dataFile << refIndex
                 << " " << mesh->numActiveElements()
                 << " " << mesh->numGlobalDofs()
                 << " " << energyError
                 << " " << solveTime
                 << " " << gmgSolver->iterationCount()
                 << endl;
      }
      else
      {
        cout << "Refinement: " << refIndex
             << " \tElements: " << mesh->numActiveElements()
             << " \tDOFs: " << mesh->numGlobalDofs()
             << " \tEnergy Error: " << energyError
             << " \tSolve Time: " << solveTime
             << endl;
        dataFile << refIndex
                 << " " << mesh->numActiveElements()
                 << " " << mesh->numGlobalDofs()
                 << " " << energyError
                 << " " << solveTime
                 << endl;
      }
    }

    // exporter.exportSolution(soln, refIndex);

    if (refIndex != numRefs)
      refStrategy.refine();
  }
  dataFile.close();

  return 0;
}