int main(int argc, char *argv[])
{
#ifdef HAVE_MPI
  Teuchos::GlobalMPISession mpiSession(&argc, &argv,0);
  choice::MpiArgs args( argc, argv );
#else
  choice::Args args( argc, argv );
#endif
  int commRank = Teuchos::GlobalMPISession::getRank();
  int numProcs = Teuchos::GlobalMPISession::getNProc();

  // Required arguments
  int numRefs = args.Input<int>("--numRefs", "number of refinement steps");
  int norm = args.Input<int>("--norm", "0 = graph\n    1 = robust\n    2 = coupled robust");

  // Optional arguments (have defaults)
  bool enforceLocalConservation = args.Input<bool>("--conserve", "enforce local conservation", false);
  double Re = args.Input("--Re", "Reynolds number", 40);
  double nu = 1./Re;
  double lambda = Re/2.-sqrt(Re*Re/4+4*pi*pi);
  int maxNewtonIterations = args.Input("--maxIterations", "maximum number of Newton iterations", 20);
  int polyOrder = args.Input("--polyOrder", "polynomial order for field variables", 2);
  int deltaP = args.Input("--deltaP", "how much to enrich test space", 2);
  // string saveFile = args.Input<string>("--meshSaveFile", "file to which to save refinement history", "");
  // string replayFile = args.Input<string>("--meshLoadFile", "file with refinement history to replay", "");
  args.Process();

  // if (commRank==0)
  // {
  //   cout << "saveFile is " << saveFile << endl;
  //   cout << "loadFile is " << replayFile << endl;
  // }

  ////////////////////   PROBLEM DEFINITIONS   ///////////////////////
  int H1Order = polyOrder+1;

  ////////////////////   DECLARE VARIABLES   ///////////////////////
  // define test variables
  VarFactory varFactory;
  // VarPtr tau11 = varFactory.testVar("tau11", HGRAD);
  // VarPtr tau12 = varFactory.testVar("tau12", HGRAD);
  // VarPtr tau22 = varFactory.testVar("tau22", HGRAD);
  VarPtr tau1 = varFactory.testVar("tau1", HDIV);
  VarPtr tau2 = varFactory.testVar("tau2", HDIV);
  VarPtr v1 = varFactory.testVar("v1", HGRAD);
  VarPtr v2 = varFactory.testVar("v2", HGRAD);
  VarPtr q = varFactory.testVar("q", HGRAD);

  // define trial variables
  VarPtr u1 = varFactory.fieldVar("u1");
  VarPtr u2 = varFactory.fieldVar("u2");
  // VarPtr sigma11 = varFactory.fieldVar("sigma11");
  // VarPtr sigma12 = varFactory.fieldVar("sigma12");
  // VarPtr sigma22 = varFactory.fieldVar("sigma22");
  VarPtr sigma1 = varFactory.fieldVar("sigma1", VECTOR_L2);
  VarPtr sigma2 = varFactory.fieldVar("sigma2", VECTOR_L2);
  VarPtr u1hat = varFactory.traceVar("u1hat");
  VarPtr u2hat = varFactory.traceVar("u2hat");
  VarPtr t1hat = varFactory.fluxVar("t1hat");
  VarPtr t2hat = varFactory.fluxVar("t2hat");
  VarPtr p = varFactory.fieldVar("p");

  ////////////////////   BUILD MESH   ///////////////////////
  BFPtr bf = Teuchos::rcp( new BF(varFactory) );

  // define nodes for mesh
  FieldContainer<double> meshBoundary(4,2);
  double xmin = -0.5;
  double xmax =  1.0;
  double ymin = -0.5;
  double ymax =  1.5;

  meshBoundary(0,0) =  xmin; // x1
  meshBoundary(0,1) =  ymin; // y1
  meshBoundary(1,0) =  xmax;
  meshBoundary(1,1) =  ymin;
  meshBoundary(2,0) =  xmax;
  meshBoundary(2,1) =  ymax;
  meshBoundary(3,0) =  xmin;
  meshBoundary(3,1) =  ymax;

  int horizontalCells = 6, verticalCells = 8;

  // create a pointer to a new mesh:
  Teuchos::RCP<Mesh> mesh = Mesh::buildQuadMesh(meshBoundary, horizontalCells, verticalCells,
                            bf, H1Order, H1Order+deltaP);

  ////////////////////////////////////////////////////////////////////
  // INITIALIZE BACKGROUND FLOW FUNCTIONS
  ////////////////////////////////////////////////////////////////////

  BCPtr nullBC = Teuchos::rcp((BC*)NULL);
  RHSPtr nullRHS = Teuchos::rcp((RHS*)NULL);
  IPPtr nullIP = Teuchos::rcp((IP*)NULL);
  SolutionPtr backgroundFlow = Teuchos::rcp(new Solution(mesh, nullBC, nullRHS, nullIP) );

  vector<double> e1(2); // (1,0)
  e1[0] = 1;
  vector<double> e2(2); // (0,1)
  e2[1] = 1;

  FunctionPtr u1_prev = Function::solution(u1, backgroundFlow);
  FunctionPtr u2_prev = Function::solution(u2, backgroundFlow);
  FunctionPtr sigma1_prev = Function::solution(sigma1, backgroundFlow);
  FunctionPtr sigma2_prev = Function::solution(sigma2, backgroundFlow);
  FunctionPtr p_prev = Function::solution(p, backgroundFlow);
  // FunctionPtr sigma11_prev = Function::solution(sigma11, backgroundFlow);
  // FunctionPtr sigma12_prev = Function::solution(sigma12, backgroundFlow);
  // FunctionPtr sigma22_prev = Function::solution(sigma22, backgroundFlow);

  FunctionPtr zero = Teuchos::rcp( new ConstantScalarFunction(0.0) );
  FunctionPtr one = Teuchos::rcp( new ConstantScalarFunction(1.0) );
  FunctionPtr u1Exact     = Teuchos::rcp( new ExactU1(lambda) );
  FunctionPtr u2Exact     = Teuchos::rcp( new ExactU2(lambda) );
  // FunctionPtr beta = e1 * u1_prev + e2 * u2_prev;

  // ==================== SET INITIAL GUESS ==========================
  map<int, Teuchos::RCP<Function> > functionMap;
  functionMap[u1->ID()] = u1Exact;
  functionMap[u2->ID()] = u2Exact;
  // functionMap[sigma1->ID()] = Function::vectorize(zero,zero);
  // functionMap[sigma2->ID()] = Function::vectorize(zero,zero);
  // functionMap[p->ID()] = zero;

  backgroundFlow->projectOntoMesh(functionMap);

  ////////////////////   DEFINE BILINEAR FORM   ///////////////////////

  // // stress equation
  bf->addTerm( 1./nu*sigma1, tau1 );
  bf->addTerm( 1./nu*sigma2, tau2 );
  bf->addTerm( u1, tau1->div() );
  bf->addTerm( u2, tau2->div() );
  bf->addTerm( -u1hat, tau1->dot_normal() );
  bf->addTerm( -u2hat, tau2->dot_normal() );
  // bf->addTerm( 1./(2*nu)*sigma11, tau11 );
  // bf->addTerm( 1./(2*nu)*sigma12, tau12 );
  // bf->addTerm( 1./(2*nu)*sigma12, tau12 );
  // bf->addTerm( 1./(2*nu)*sigma22, tau22 );
  // bf->addTerm( u1, tau11->dx() );
  // bf->addTerm( u1, tau12->dy() );
  // bf->addTerm( u2, tau12->dx() );
  // bf->addTerm( u2, tau22->dy() );
  // bf->addTerm( -u1hat, tau11->times_normal_x() );
  // bf->addTerm( -u1hat, tau12->times_normal_y() );
  // bf->addTerm( -u2hat, tau12->times_normal_x() );
  // bf->addTerm( -u2hat, tau22->times_normal_y() );

  // momentum equation
  bf->addTerm( -2.*u1_prev*u1, v1->dx() );
  bf->addTerm( -u2_prev*u1, v1->dy() );
  bf->addTerm( -u1_prev*u2, v1->dy() );
  bf->addTerm( -u2_prev*u1, v2->dx() );
  bf->addTerm( -u1_prev*u2, v1->dy() );
  bf->addTerm( -2.*u2_prev*u2, v2->dy() );
  bf->addTerm( -p, v1->dx() );
  bf->addTerm( -p, v2->dy() );
  // bf->addTerm( sigma11, v1->dx() );
  // bf->addTerm( sigma12, v1->dy() );
  // bf->addTerm( sigma12, v2->dx() );
  // bf->addTerm( sigma22, v2->dy() );
  bf->addTerm( sigma1, v1->grad() );
  bf->addTerm( sigma2, v2->grad() );
  bf->addTerm( t1hat, v1);
  bf->addTerm( t2hat, v2);

  // continuity equation
  bf->addTerm( -u1, q->dx() );
  bf->addTerm( -u2, q->dy() );
  bf->addTerm( u1hat, q->times_normal_x() );
  bf->addTerm( u2hat, q->times_normal_y() );

  ////////////////////   SPECIFY RHS   ///////////////////////
  Teuchos::RCP<RHSEasy> rhs = Teuchos::rcp( new RHSEasy );

  // stress equation
  rhs->addTerm( -u1_prev * tau1->div() );
  rhs->addTerm( -u2_prev * tau2->div() );

  // momentum equation
  rhs->addTerm( 2.*u1_prev*u1_prev * v1->dx() );
  rhs->addTerm( u2_prev*u1_prev    * v1->dy() );
  rhs->addTerm( u1_prev*u2_prev    * v1->dy() );
  rhs->addTerm( u2_prev*u1_prev    * v2->dx() );
  rhs->addTerm( u1_prev*u2_prev    * v1->dy() );
  rhs->addTerm( 2.*u2_prev*u2_prev * v2->dy() );
  // rhs->addTerm( p_prev             * v1->dx() );
  // rhs->addTerm( p_prev             * v2->dy() );
  // rhs->addTerm( -sigma1_prev       * v1->grad() );
  // rhs->addTerm( -sigma2_prev       * v2->grad() );

  // rhs->addTerm( -sigma11_prev * v1->dx() );
  // rhs->addTerm( -sigma12_prev * v1->dy() );
  // rhs->addTerm( -sigma12_prev * v2->dx() );
  // rhs->addTerm( -sigma22_prev * v2->dy() );

  // continuity equation
  rhs->addTerm( u1_prev * q->dx() );
  rhs->addTerm( u2_prev * q->dy() );

  ////////////////////   DEFINE INNER PRODUCT(S)   ///////////////////////
  IPPtr ip = Teuchos::rcp(new IP);
  if (norm == 0)
  {
    ip = bf->graphNorm();
  }
  else if (norm == 1)
  {
    // ip = bf->l2Norm();
  }

  ////////////////////   CREATE BCs   ///////////////////////
  Teuchos::RCP<BCEasy> bc = Teuchos::rcp( new BCEasy );
  // Teuchos::RCP<PenaltyConstraints> pc = Teuchos::rcp( new PenaltyConstraints );
  SpatialFilterPtr left = Teuchos::rcp( new ConstantXBoundary(-0.5) );
  SpatialFilterPtr right = Teuchos::rcp( new ConstantXBoundary(1) );
  SpatialFilterPtr top = Teuchos::rcp( new ConstantYBoundary(-0.5) );
  SpatialFilterPtr bottom = Teuchos::rcp( new ConstantYBoundary(1.5) );
  bc->addDirichlet(u1hat, left, u1Exact);
  bc->addDirichlet(u2hat, left, u2Exact);
  bc->addDirichlet(u1hat, right, u1Exact);
  bc->addDirichlet(u2hat, right, u2Exact);
  bc->addDirichlet(u1hat, top, u1Exact);
  bc->addDirichlet(u2hat, top, u2Exact);
  bc->addDirichlet(u1hat, bottom, u1Exact);
  bc->addDirichlet(u2hat, bottom, u2Exact);

  // zero mean constraint on pressure
  bc->addZeroMeanConstraint(p);

  // pc->addConstraint(u1hat*u2hat-t1hat == zero, top);
  // pc->addConstraint(u2hat*u2hat-t2hat == zero, top);

  Teuchos::RCP<Solution> solution = Teuchos::rcp( new Solution(mesh, bc, rhs, ip) );
  // solution->setFilter(pc);

  // if (enforceLocalConservation) {
  //   solution->lagrangeConstraints()->addConstraint(u1hat->times_normal_x() + u2hat->times_normal_y() == zero);
  // }

  // ==================== Register Solutions ==========================
  mesh->registerSolution(solution);
  mesh->registerSolution(backgroundFlow);

  // Teuchos::RCP< RefinementHistory > refHistory = Teuchos::rcp( new RefinementHistory );
  // mesh->registerObserver(refHistory);

  ////////////////////   SOLVE & REFINE   ///////////////////////
  double energyThreshold = 0.2; // for mesh refinements
  RefinementStrategy refinementStrategy( solution, energyThreshold );
  VTKExporter exporter(backgroundFlow, mesh, varFactory);
  stringstream outfile;
  outfile << "kovasznay" << "_" << 0;
  exporter.exportSolution(outfile.str());

  double nonlinearRelativeEnergyTolerance = 1e-5; // used to determine convergence of the nonlinear solution
  for (int refIndex=0; refIndex<=numRefs; refIndex++)
  {
    double L2Update = 1e10;
    int iterCount = 0;
    while (L2Update > nonlinearRelativeEnergyTolerance && iterCount < maxNewtonIterations)
    {
      solution->solve(false);
      double u1L2Update = solution->L2NormOfSolutionGlobal(u1->ID());
      double u2L2Update = solution->L2NormOfSolutionGlobal(u2->ID());
      L2Update = sqrt(u1L2Update*u1L2Update + u2L2Update*u2L2Update);

      // Check local conservation
      if (commRank == 0)
      {
        cout << "L2 Norm of Update = " << L2Update << endl;

        // if (saveFile.length() > 0) {
        //   std::ostringstream oss;
        //   oss << string(saveFile) << refIndex ;
        //   cout << "on refinement " << refIndex << " saving mesh file to " << oss.str() << endl;
        //   refHistory->saveToFile(oss.str());
        // }
      }

      // line search algorithm
      double alpha = 1.0;
      backgroundFlow->addSolution(solution, alpha);
      iterCount++;
    }

    if (commRank == 0)
    {
      stringstream outfile;
      outfile << "kovasznay" << "_" << refIndex+1;
      exporter.exportSolution(outfile.str());
    }

    if (refIndex < numRefs)
      refinementStrategy.refine(commRank==0); // print to console on commRank 0
  }

  return 0;
}
int main(int argc, char *argv[])
{
#ifdef HAVE_MPI
  Teuchos::GlobalMPISession mpiSession(&argc, &argv,0);
  choice::MpiArgs args( argc, argv );
#else
  choice::Args args( argc, argv );
#endif
  int commRank = Teuchos::GlobalMPISession::getRank();
  int numProcs = Teuchos::GlobalMPISession::getNProc();

  // Required arguments
  int numRefs = args.Input<int>("--numRefs", "number of refinement steps");
  int norm = args.Input<int>("--norm", "0 = graph\n    1 = robust\n    2 = coupled robust");

  // Optional arguments (have defaults)
  int uniformRefinements = args.Input("--uniformRefinements", "number of uniform refinements", 0);
  bool enforceLocalConservation = args.Input<bool>("--conserve", "enforce local conservation", false);
  double radius = args.Input("--r", "cylinder radius", 0.6);
  int Re = args.Input("--Re", "Reynolds number", 1);
  int maxNewtonIterations = args.Input("--maxIterations", "maximum number of Newton iterations", 1);
  int polyOrder = args.Input("--polyOrder", "polynomial order for field variables", 2);
  int deltaP = args.Input("--deltaP", "how much to enrich test space", 2);
  // string saveFile = args.Input<string>("--meshSaveFile", "file to which to save refinement history", "");
  // string replayFile = args.Input<string>("--meshLoadFile", "file with refinement history to replay", "");
  args.Process();

  ////////////////////   PROBLEM DEFINITIONS   ///////////////////////
  int H1Order = polyOrder+1;

  ////////////////////   DECLARE VARIABLES   ///////////////////////
  // define test variables
  VarFactory varFactory;
  VarPtr tau1 = varFactory.testVar("tau1", HDIV);
  VarPtr tau2 = varFactory.testVar("tau2", HDIV);
  VarPtr v1 = varFactory.testVar("v1", HGRAD);
  VarPtr v2 = varFactory.testVar("v2", HGRAD);
  VarPtr vc = varFactory.testVar("vc", HGRAD);

  // define trial variables
  VarPtr u1 = varFactory.fieldVar("u1");
  VarPtr u2 = varFactory.fieldVar("u2");
  VarPtr p = varFactory.fieldVar("p");
  VarPtr u1hat = varFactory.traceVar("u1hat");
  VarPtr u2hat = varFactory.traceVar("u2hat");
  VarPtr t1hat = varFactory.fluxVar("t1hat");
  VarPtr t2hat = varFactory.fluxVar("t2hat");
  VarPtr sigma1 = varFactory.fieldVar("sigma1", VECTOR_L2);
  VarPtr sigma2 = varFactory.fieldVar("sigma2", VECTOR_L2);

  ////////////////////   BUILD MESH   ///////////////////////
  BFPtr bf = Teuchos::rcp( new BF(varFactory) );

  // create a pointer to a new mesh:
  Teuchos::RCP<Mesh> mesh = MeshFactory::shiftedHemkerMesh(-1, 3, 2, radius, bf, H1Order, deltaP);

  ////////////////////////////////////////////////////////////////////
  // INITIALIZE BACKGROUND FLOW FUNCTIONS
  ////////////////////////////////////////////////////////////////////

  BCPtr nullBC = Teuchos::rcp((BC*)NULL);
  RHSPtr nullRHS = Teuchos::rcp((RHS*)NULL);
  IPPtr nullIP = Teuchos::rcp((IP*)NULL);
  SolutionPtr backgroundFlow = Teuchos::rcp(new Solution(mesh, nullBC, nullRHS, nullIP) );

  vector<double> e1(2); // (1,0)
  e1[0] = 1;
  vector<double> e2(2); // (0,1)
  e2[1] = 1;

  FunctionPtr u1_prev = Function::solution(u1, backgroundFlow);
  FunctionPtr u2_prev = Function::solution(u2, backgroundFlow);
  FunctionPtr sigma1_prev = Function::solution(sigma1, backgroundFlow);
  FunctionPtr sigma2_prev = Function::solution(sigma2, backgroundFlow);

  FunctionPtr zero = Teuchos::rcp( new ConstantScalarFunction(0.0) );
  FunctionPtr one = Teuchos::rcp( new ConstantScalarFunction(1.0) );
  FunctionPtr beta = e1 * u1_prev + e2 * u2_prev;

  // ==================== SET INITIAL GUESS ==========================
  map<int, Teuchos::RCP<Function> > functionMap;
  functionMap[u1->ID()] = one;
  functionMap[u2->ID()] = zero;
  functionMap[sigma1->ID()] = Function::vectorize(zero,zero);
  functionMap[sigma2->ID()] = Function::vectorize(zero,zero);
  functionMap[p->ID()] = zero;

  backgroundFlow->projectOntoMesh(functionMap);

  ////////////////////   DEFINE BILINEAR FORM   ///////////////////////

  // // stress equation
  bf->addTerm( sigma1, tau1 );
  bf->addTerm( sigma2, tau2 );
  bf->addTerm( u1, tau1->div() );
  bf->addTerm( u2, tau2->div() );
  bf->addTerm( -u1hat, tau1->dot_normal() );
  bf->addTerm( -u2hat, tau2->dot_normal() );

  // momentum equation
  // bf->addTerm( Function::xPart(sigma1_prev)*u1, v1 );
  // bf->addTerm( Function::yPart(sigma1_prev)*u2, v1 );
  // bf->addTerm( Function::xPart(sigma2_prev)*u1, v2 );
  // bf->addTerm( Function::yPart(sigma2_prev)*u2, v2 );
  // bf->addTerm( beta*sigma1, v1);
  // bf->addTerm( beta*sigma2, v2);
  bf->addTerm( 1./Re*sigma1, v1->grad() );
  bf->addTerm( 1./Re*sigma2, v2->grad() );
  bf->addTerm( t1hat, v1);
  bf->addTerm( t2hat, v2);
  bf->addTerm( -p, v1->dx() );
  bf->addTerm( -p, v2->dy() );

  // continuity equation
  bf->addTerm( -u1, vc->dx() );
  bf->addTerm( -u2, vc->dy() );
  bf->addTerm( u1hat, vc->times_normal_x() );
  bf->addTerm( u2hat, vc->times_normal_y() );

  ////////////////////   SPECIFY RHS   ///////////////////////
  Teuchos::RCP<RHSEasy> rhs = Teuchos::rcp( new RHSEasy );

  // stress equation
  rhs->addTerm( -sigma1_prev * tau1 );
  rhs->addTerm( -sigma2_prev * tau2 );
  rhs->addTerm( -u1_prev * tau1->div() );
  rhs->addTerm( -u2_prev * tau2->div() );

  // momentum equation
  // rhs->addTerm( -beta*sigma1_prev * v1 );
  // rhs->addTerm( -beta*sigma2_prev * v2 );
  rhs->addTerm( -1./Re*sigma1_prev * v1->grad() );
  rhs->addTerm( -1./Re*sigma2_prev * v2->grad() );

  // continuity equation
  rhs->addTerm( u1_prev * vc->dx() );
  rhs->addTerm( u2_prev * vc->dy() );

  ////////////////////   DEFINE INNER PRODUCT(S)   ///////////////////////
  IPPtr ip = Teuchos::rcp(new IP);
  if (norm == 0)
  {
    ip = bf->graphNorm();
  }
  else if (norm == 1)
  {
    // ip = bf->l2Norm();
  }

  ////////////////////   CREATE BCs   ///////////////////////
  Teuchos::RCP<BCEasy> bc = Teuchos::rcp( new BCEasy );
  SpatialFilterPtr left = Teuchos::rcp( new ConstantXBoundary(-1) );
  SpatialFilterPtr right = Teuchos::rcp( new ConstantXBoundary(3) );
  SpatialFilterPtr top = Teuchos::rcp( new ConstantYBoundary(1) );
  SpatialFilterPtr bottom = Teuchos::rcp( new ConstantYBoundary(-1) );
  SpatialFilterPtr circle = Teuchos::rcp( new CircleBoundary(radius) );
  FunctionPtr boundaryU1 = Teuchos::rcp( new BoundaryU1 );
  bc->addDirichlet(u1hat, left, boundaryU1);
  bc->addDirichlet(u2hat, left, zero);
  bc->addDirichlet(u1hat, right, boundaryU1);
  bc->addDirichlet(u2hat, right, zero);
  bc->addDirichlet(u1hat, top, zero);
  bc->addDirichlet(u2hat, top, zero);
  bc->addDirichlet(u1hat, bottom, zero);
  bc->addDirichlet(u2hat, bottom, zero);
  bc->addDirichlet(u1hat, circle, zero);
  bc->addDirichlet(u2hat, circle, zero);

  // zero mean constraint on pressure
  bc->addZeroMeanConstraint(p);

  Teuchos::RCP<Solution> solution = Teuchos::rcp( new Solution(mesh, bc, rhs, ip) );

  if (enforceLocalConservation)
  {
    solution->lagrangeConstraints()->addConstraint(u1hat->times_normal_x() + u2hat->times_normal_y() == zero);
  }

  // ==================== Register Solutions ==========================
  mesh->registerSolution(solution);
  mesh->registerSolution(backgroundFlow);

  // Teuchos::RCP< RefinementHistory > refHistory = Teuchos::rcp( new RefinementHistory );
  // mesh->registerObserver(refHistory);

  ////////////////////   SOLVE & REFINE   ///////////////////////
  double energyThreshold = 0.2; // for mesh refinements
  RefinementStrategy refinementStrategy( solution, energyThreshold );
  VTKExporter exporter(backgroundFlow, mesh, varFactory);
  ofstream errOut;
  ofstream fluxOut;
  if (commRank == 0)
  {
    errOut.open("stokeshemker_err.txt");
    fluxOut.open("stokeshemker_flux.txt");
  }
  errOut.precision(15);
  fluxOut.precision(15);

  // Cell IDs for flux calculations
  vector< pair<ElementPtr, int> > cellFace0;
  vector< pair<ElementPtr, int> > cellFace1;
  vector< pair<ElementPtr, int> > cellFace2;
  vector< pair<ElementPtr, int> > cellFace3;
  vector< pair<ElementPtr, int> > cellFace4;
  cellFace0.push_back(make_pair(mesh->getElement(12), 3));
  cellFace0.push_back(make_pair(mesh->getElement(13), 3));
  cellFace0.push_back(make_pair(mesh->getElement(14), 3));
  cellFace0.push_back(make_pair(mesh->getElement(15), 3));
  cellFace1.push_back(make_pair(mesh->getElement(12), 1));
  cellFace1.push_back(make_pair(mesh->getElement(13), 1));
  cellFace1.push_back(make_pair(mesh->getElement(14), 1));
  cellFace1.push_back(make_pair(mesh->getElement(15), 1));
  cellFace2.push_back(make_pair(mesh->getElement(11), 1));
  cellFace2.push_back(make_pair(mesh->getElement(2 ), 0));
  cellFace2.push_back(make_pair(mesh->getElement(5 ), 2));
  cellFace2.push_back(make_pair(mesh->getElement(16), 1));
  cellFace3.push_back(make_pair(mesh->getElement(9 ), 3));
  cellFace3.push_back(make_pair(mesh->getElement(8 ), 3));
  cellFace3.push_back(make_pair(mesh->getElement(19), 3));
  cellFace3.push_back(make_pair(mesh->getElement(18), 3));
  cellFace4.push_back(make_pair(mesh->getElement(9 ), 1));
  cellFace4.push_back(make_pair(mesh->getElement(8 ), 1));
  cellFace4.push_back(make_pair(mesh->getElement(19), 1));
  cellFace4.push_back(make_pair(mesh->getElement(18), 1));

  // // for loading refinement history
  // if (replayFile.length() > 0) {
  //   RefinementHistory refHistory;
  //   replayFile = replayFile;
  //   refHistory.loadFromFile(replayFile);
  //   refHistory.playback(mesh);
  //   int numElems = mesh->numActiveElements();
  //   if (commRank==0){
  //     double minSideLength = meshInfo.getMinCellSideLength() ;
  //     cout << "after replay, num elems = " << numElems << " and min side length = " << minSideLength << endl;
  //   }
  // }

  for (int i = 0; i < uniformRefinements; i++)
    refinementStrategy.hRefineUniformly(mesh);

  double nonlinearRelativeEnergyTolerance = 1e-5; // used to determine convergence of the nonlinear solution
  for (int refIndex=0; refIndex<=numRefs; refIndex++)
  {
    double L2Update = 1e10;
    int iterCount = 0;
    while (L2Update > nonlinearRelativeEnergyTolerance && iterCount < maxNewtonIterations)
    {
      solution->solve(false);
      double u1L2Update = solution->L2NormOfSolutionGlobal(u1->ID());
      double u2L2Update = solution->L2NormOfSolutionGlobal(u2->ID());
      L2Update = sqrt(u1L2Update*u1L2Update + u2L2Update*u2L2Update);
      double energy_error = solution->energyErrorTotal();

      // Check local conservation
      if (commRank == 0)
      {
        FunctionPtr n = Function::normal();
        FunctionPtr u1_prev = Function::solution(u1hat, solution);
        FunctionPtr u2_prev = Function::solution(u2hat, solution);
        FunctionPtr flux = u1_prev*n->x() + u2_prev*n->y();
        Teuchos::Tuple<double, 3> fluxImbalances = checkConservation(flux, zero, mesh);
        // cout << "Mass flux: Largest Local = " << fluxImbalances[0]
        //   << ", Global = " << fluxImbalances[1] << ", Sum Abs = " << fluxImbalances[2] << endl;

        errOut << mesh->numGlobalDofs() << " " << energy_error << " "
               << fluxImbalances[0] << " " << fluxImbalances[1] << " " << fluxImbalances[2] << endl;

        double massFlux0 = computeFluxOverElementSides(u1_prev, mesh, cellFace0);
        double massFlux1 = computeFluxOverElementSides(u1_prev, mesh, cellFace1);
        double massFlux2 = computeFluxOverElementSides(u1_prev, mesh, cellFace2);
        double massFlux3 = computeFluxOverElementSides(u1_prev, mesh, cellFace3);
        double massFlux4 = computeFluxOverElementSides(u1_prev, mesh, cellFace4);
        fluxOut << massFlux0 << " " << massFlux1 << " " << massFlux2 << " " << massFlux3 << " " << massFlux4 << " " << endl;
        cout << "Total mass flux = " << massFlux0 << " " << massFlux1 << " " << massFlux2 << " " << massFlux3 << " " << massFlux4 << " " << endl;

        // if (saveFile.length() > 0) {
        //   std::ostringstream oss;
        //   oss << string(saveFile) << refIndex ;
        //   cout << "on refinement " << refIndex << " saving mesh file to " << oss.str() << endl;
        //   refHistory->saveToFile(oss.str());
        // }
      }

      // line search algorithm
      double alpha = 1.0;
      // bool useLineSearch = false;
      // int posEnrich = 5; // amount of enriching of grid points on which to ensure positivity
      // if (useLineSearch){ // to enforce positivity of density rho
      //   double lineSearchFactor = .5; double eps = .001; // arbitrary
      //   FunctionPtr rhoTemp = Function::solution(rho,backgroundFlow) + alpha*Function::solution(rho,solution) - Function::constant(eps);
      //   FunctionPtr eTemp = Function::solution(e,backgroundFlow) + alpha*Function::solution(e,solution) - Function::constant(eps);
      //   bool rhoIsPositive = rhoTemp->isPositive(mesh,posEnrich);
      //   bool eIsPositive = eTemp->isPositive(mesh,posEnrich);
      //   int iter = 0; int maxIter = 20;
      //   while (!(rhoIsPositive && eIsPositive) && iter < maxIter){
      //     alpha = alpha*lineSearchFactor;
      //     rhoTemp = Function::solution(rho,backgroundFlow) + alpha*Function::solution(rho,solution);
      //     eTemp = Function::solution(e,backgroundFlow) + alpha*Function::solution(e,solution);
      //     rhoIsPositive = rhoTemp->isPositive(mesh,posEnrich);
      //     eIsPositive = eTemp->isPositive(mesh,posEnrich);
      //     iter++;
      //   }
      //   if (commRank==0 && alpha < 1.0){
      //     cout << "line search factor alpha = " << alpha << endl;
      //   }
      // }

      backgroundFlow->addSolution(solution, alpha, false, true);
      iterCount++;
      // if (commRank == 0)
      //   cout << "L2 Norm of Update = " << L2Update << endl;
    }
    if (commRank == 0)
      cout << endl;

    if (commRank == 0)
    {
      stringstream outfile;
      outfile << "stokeshemker" << uniformRefinements << "_" << refIndex;
      exporter.exportSolution(outfile.str());
    }

    if (refIndex < numRefs)
      refinementStrategy.refine(commRank==0); // print to console on commRank 0
  }
  if (commRank == 0)
  {
    errOut.close();
    fluxOut.close();
  }

  return 0;
}
int main(int argc, char *argv[])
{
  int rank = 0;
#ifdef HAVE_MPI
  // TODO: figure out the right thing to do here...
  // may want to modify argc and argv before we make the following call:
  Teuchos::GlobalMPISession mpiSession(&argc, &argv,0);
  rank=mpiSession.getRank();
#else
#endif
  bool useLineSearch = false;

  int pToAdd = 2; // for optimal test function approximation
  int pToAddForStreamFunction = 2;
  double nonlinearStepSize = 1.0;
  double dt = 0.5;
  double nonlinearRelativeEnergyTolerance = 0.015; // used to determine convergence of the nonlinear solution
  //  double nonlinearRelativeEnergyTolerance = 0.15; // used to determine convergence of the nonlinear solution
  double eps = 1.0/64.0; // width of ramp up to 1.0 for top BC;  eps == 0 ==> soln not in H1
  // epsilon above is chosen to match our initial 16x16 mesh, to avoid quadrature errors.
  //  double eps = 0.0; // John Evans's problem: not in H^1
  bool enforceLocalConservation = false;
  bool enforceOneIrregularity = true;
  bool reportPerCellErrors  = true;
  bool useMumps = true;

  int horizontalCells, verticalCells;

  int maxIters = 50; // for nonlinear steps

  vector<double> ReValues;

  // usage: polyOrder [numRefinements]
  // parse args:
  if (argc < 6)
  {
    cout << "Usage: NavierStokesCavityFlowContinuationFixedMesh fieldPolyOrder hCells vCells energyErrorGoal Re0 [Re1 ...]\n";
    return -1;
  }
  int polyOrder = atoi(argv[1]);
  horizontalCells = atoi(argv[2]);
  verticalCells = atoi(argv[3]);
  double energyErrorGoal = atof(argv[4]);
  for (int i=5; i<argc; i++)
  {
    ReValues.push_back(atof(argv[i]));
  }
  if (rank == 0)
  {
    cout << "L^2 order: " << polyOrder << endl;
    cout << "initial mesh size: " << horizontalCells << " x " << verticalCells << endl;
    cout << "energy error goal: " << energyErrorGoal << endl;
    cout << "Reynolds number values for continuation:\n";
    for (int i=0; i<ReValues.size(); i++)
    {
      cout << ReValues[i] << ", ";
    }
    cout << endl;
  }

  FieldContainer<double> quadPoints(4,2);

  quadPoints(0,0) = 0.0; // x1
  quadPoints(0,1) = 0.0; // y1
  quadPoints(1,0) = 1.0;
  quadPoints(1,1) = 0.0;
  quadPoints(2,0) = 1.0;
  quadPoints(2,1) = 1.0;
  quadPoints(3,0) = 0.0;
  quadPoints(3,1) = 1.0;

  // define meshes:
  int H1Order = polyOrder + 1;
  bool useTriangles = false;
  bool meshHasTriangles = useTriangles;

  double minL2Increment = 1e-8;

  // get variable definitions:
  VarFactory varFactory = VGPStokesFormulation::vgpVarFactory();
  u1 = varFactory.fieldVar(VGP_U1_S);
  u2 = varFactory.fieldVar(VGP_U2_S);
  sigma11 = varFactory.fieldVar(VGP_SIGMA11_S);
  sigma12 = varFactory.fieldVar(VGP_SIGMA12_S);
  sigma21 = varFactory.fieldVar(VGP_SIGMA21_S);
  sigma22 = varFactory.fieldVar(VGP_SIGMA22_S);
  p = varFactory.fieldVar(VGP_P_S);

  u1hat = varFactory.traceVar(VGP_U1HAT_S);
  u2hat = varFactory.traceVar(VGP_U2HAT_S);
  t1n = varFactory.fluxVar(VGP_T1HAT_S);
  t2n = varFactory.fluxVar(VGP_T2HAT_S);

  v1 = varFactory.testVar(VGP_V1_S, HGRAD);
  v2 = varFactory.testVar(VGP_V2_S, HGRAD);
  tau1 = varFactory.testVar(VGP_TAU1_S, HDIV);
  tau2 = varFactory.testVar(VGP_TAU2_S, HDIV);
  q = varFactory.testVar(VGP_Q_S, HGRAD);

  FunctionPtr u1_0 = Teuchos::rcp( new U1_0(eps) );
  FunctionPtr u2_0 = Teuchos::rcp( new U2_0 );
  FunctionPtr zero = Function::zero();
  ParameterFunctionPtr Re_param = ParameterFunction::parameterFunction(1);
  VGPNavierStokesProblem problem = VGPNavierStokesProblem(Re_param,quadPoints,
                                   horizontalCells,verticalCells,
                                   H1Order, pToAdd,
                                   u1_0, u2_0,  // BC for u
                                   zero, zero); // zero forcing function
  SolutionPtr solution = problem.backgroundFlow();
  SolutionPtr solnIncrement = problem.solutionIncrement();

  Teuchos::RCP<Mesh> mesh = problem.mesh();
  mesh->registerSolution(solution);
  mesh->registerSolution(solnIncrement);

  ///////////////////////////////////////////////////////////////////////////

  // define bilinear form for stream function:
  VarFactory streamVarFactory;
  VarPtr phi_hat = streamVarFactory.traceVar("\\widehat{\\phi}");
  VarPtr psin_hat = streamVarFactory.fluxVar("\\widehat{\\psi}_n");
  VarPtr psi_1 = streamVarFactory.fieldVar("\\psi_1");
  VarPtr psi_2 = streamVarFactory.fieldVar("\\psi_2");
  VarPtr phi = streamVarFactory.fieldVar("\\phi");
  VarPtr q_s = streamVarFactory.testVar("q_s", HGRAD);
  VarPtr v_s = streamVarFactory.testVar("v_s", HDIV);
  BFPtr streamBF = Teuchos::rcp( new BF(streamVarFactory) );
  streamBF->addTerm(psi_1, q_s->dx());
  streamBF->addTerm(psi_2, q_s->dy());
  streamBF->addTerm(-psin_hat, q_s);

  streamBF->addTerm(psi_1, v_s->x());
  streamBF->addTerm(psi_2, v_s->y());
  streamBF->addTerm(phi, v_s->div());
  streamBF->addTerm(-phi_hat, v_s->dot_normal());

  Teuchos::RCP<Mesh> streamMesh, overkillMesh;

  streamMesh = MeshFactory::buildQuadMesh(quadPoints, horizontalCells, verticalCells,
                                          streamBF, H1Order+pToAddForStreamFunction,
                                          H1Order+pToAdd+pToAddForStreamFunction, useTriangles);

  mesh->registerObserver(streamMesh); // will refine streamMesh in the same way as mesh.

  map<int, double> dofsToL2error; // key: numGlobalDofs, value: total L2error compared with overkill
  vector< VarPtr > fields;
  fields.push_back(u1);
  fields.push_back(u2);
  fields.push_back(sigma11);
  fields.push_back(sigma12);
  fields.push_back(sigma21);
  fields.push_back(sigma22);
  fields.push_back(p);

  if (rank == 0)
  {
    cout << "Starting mesh has " << horizontalCells << " x " << verticalCells << " elements and ";
    cout << mesh->numGlobalDofs() << " total dofs.\n";
    cout << "polyOrder = " << polyOrder << endl;
    cout << "pToAdd = " << pToAdd << endl;
    cout << "eps for top BC = " << eps << endl;

    if (useTriangles)
    {
      cout << "Using triangles.\n";
    }
    if (enforceLocalConservation)
    {
      cout << "Enforcing local conservation.\n";
    }
    else
    {
      cout << "NOT enforcing local conservation.\n";
    }
    if (enforceOneIrregularity)
    {
      cout << "Enforcing 1-irregularity.\n";
    }
    else
    {
      cout << "NOT enforcing 1-irregularity.\n";
    }
  }

  ////////////////////   CREATE BCs   ///////////////////////
  SpatialFilterPtr entireBoundary = Teuchos::rcp( new SpatialFilterUnfiltered );

  FunctionPtr u1_prev = Function::solution(u1,solution);
  FunctionPtr u2_prev = Function::solution(u2,solution);

  FunctionPtr u1hat_prev = Function::solution(u1hat,solution);
  FunctionPtr u2hat_prev = Function::solution(u2hat,solution);


  ////////////////////   SOLVE & REFINE   ///////////////////////

  FunctionPtr vorticity = Teuchos::rcp( new PreviousSolutionFunction(solution, - u1->dy() + u2->dx() ) );
  //  FunctionPtr vorticity = Teuchos::rcp( new PreviousSolutionFunction(solution,sigma12 - sigma21) );
  RHSPtr streamRHS = RHS::rhs();
  streamRHS->addTerm(vorticity * q_s);
  ((PreviousSolutionFunction*) vorticity.get())->setOverrideMeshCheck(true);
  ((PreviousSolutionFunction*) u1_prev.get())->setOverrideMeshCheck(true);
  ((PreviousSolutionFunction*) u2_prev.get())->setOverrideMeshCheck(true);

  BCPtr streamBC = BC::bc();
  //  streamBC->addDirichlet(psin_hat, entireBoundary, u0_cross_n);
  streamBC->addDirichlet(phi_hat, entireBoundary, zero);
  //  streamBC->addZeroMeanConstraint(phi);

  IPPtr streamIP = Teuchos::rcp( new IP );
  streamIP->addTerm(q_s);
  streamIP->addTerm(q_s->grad());
  streamIP->addTerm(v_s);
  streamIP->addTerm(v_s->div());
  SolutionPtr streamSolution = Teuchos::rcp( new Solution( streamMesh, streamBC, streamRHS, streamIP ) );

  if (enforceLocalConservation)
  {
    FunctionPtr zero = Function::zero();
    solution->lagrangeConstraints()->addConstraint(u1hat->times_normal_x() + u2hat->times_normal_y()==zero);
    solnIncrement->lagrangeConstraints()->addConstraint(u1hat->times_normal_x() + u2hat->times_normal_y()==zero);
  }

  if (true)
  {
    FunctionPtr u1_incr = Function::solution(u1, solnIncrement);
    FunctionPtr u2_incr = Function::solution(u2, solnIncrement);
    FunctionPtr sigma11_incr = Function::solution(sigma11, solnIncrement);
    FunctionPtr sigma12_incr = Function::solution(sigma12, solnIncrement);
    FunctionPtr sigma21_incr = Function::solution(sigma21, solnIncrement);
    FunctionPtr sigma22_incr = Function::solution(sigma22, solnIncrement);
    FunctionPtr p_incr = Function::solution(p, solnIncrement);

    FunctionPtr l2_incr = u1_incr * u1_incr + u2_incr * u2_incr + p_incr * p_incr
                          + sigma11_incr * sigma11_incr + sigma12_incr * sigma12_incr
                          + sigma21_incr * sigma21_incr + sigma22_incr * sigma22_incr;

    double energyThreshold = 0.20;
    Teuchos::RCP< RefinementStrategy > refinementStrategy = Teuchos::rcp( new RefinementStrategy( solnIncrement, energyThreshold ));

    for (int i=0; i<ReValues.size(); i++)
    {
      double Re = ReValues[i];
      Re_param->setValue(Re);
      if (rank==0) cout << "Solving with Re = " << Re << ":\n";
      double energyErrorTotal;
      do
      {
        double incr_norm;
        do
        {
          problem.iterate(useLineSearch);
          incr_norm = sqrt(l2_incr->integrate(problem.mesh()));
          if (rank==0)
          {
            cout << "\x1B[2K"; // Erase the entire current line.
            cout << "\x1B[0E"; // Move to the beginning of the current line.
            cout << "Iteration: " << problem.iterationCount() << "; L^2(incr) = " << incr_norm;
            flush(cout);
          }
        }
        while ((incr_norm > minL2Increment ) && (problem.iterationCount() < maxIters));
        if (rank==0) cout << endl;
        problem.setIterationCount(1); // 1 means reuse background flow (which we must, given that we want continuation in Re...)
        energyErrorTotal = solnIncrement->energyErrorTotal(); //solution->energyErrorTotal();
        if (energyErrorTotal > energyErrorGoal)
        {
          refinementStrategy->refine(false);
        }
        if (rank==0)
        {
          cout << "Energy error: " << energyErrorTotal << endl;
        }
      }
      while (energyErrorTotal > energyErrorGoal);
    }
  }

  double energyErrorTotal = solution->energyErrorTotal();
  double incrementalEnergyErrorTotal = solnIncrement->energyErrorTotal();
  if (rank == 0)
  {
    cout << "final mesh has " << mesh->numActiveElements() << " elements and " << mesh->numGlobalDofs() << " dofs.\n";
    cout << "energy error: " << energyErrorTotal << endl;
    cout << "  (Incremental solution's energy error is " << incrementalEnergyErrorTotal << ".)\n";
  }

  FunctionPtr u1_sq = u1_prev * u1_prev;
  FunctionPtr u_dot_u = u1_sq + (u2_prev * u2_prev);
  FunctionPtr u_mag = Teuchos::rcp( new SqrtFunction( u_dot_u ) );
  FunctionPtr u_div = Teuchos::rcp( new PreviousSolutionFunction(solution, u1->dx() + u2->dy() ) );
  FunctionPtr massFlux = Teuchos::rcp( new PreviousSolutionFunction(solution, u1hat->times_normal_x() + u2hat->times_normal_y()) );

  // check that the zero mean pressure is being correctly imposed:
  FunctionPtr p_prev = Teuchos::rcp( new PreviousSolutionFunction(solution,p) );
  double p_avg = p_prev->integrate(mesh);
  if (rank==0)
    cout << "Integral of pressure: " << p_avg << endl;

  // integrate massFlux over each element (a test):
  // fake a new bilinear form so we can integrate against 1
  VarPtr testOne = varFactory.testVar("1",CONSTANT_SCALAR);
  BFPtr fakeBF = Teuchos::rcp( new BF(varFactory) );
  LinearTermPtr massFluxTerm = massFlux * testOne;

  CellTopoPtrLegacy quadTopoPtr = Teuchos::rcp(new shards::CellTopology(shards::getCellTopologyData<shards::Quadrilateral<4> >() ));
  DofOrderingFactory dofOrderingFactory(fakeBF);
  int fakeTestOrder = H1Order;
  DofOrderingPtr testOrdering = dofOrderingFactory.testOrdering(fakeTestOrder, *quadTopoPtr);

  int testOneIndex = testOrdering->getDofIndex(testOne->ID(),0);
  vector< ElementTypePtr > elemTypes = mesh->elementTypes(); // global element types
  map<int, double> massFluxIntegral; // cellID -> integral
  double maxMassFluxIntegral = 0.0;
  double totalMassFlux = 0.0;
  double totalAbsMassFlux = 0.0;
  double maxCellMeasure = 0;
  double minCellMeasure = 1;
  for (vector< ElementTypePtr >::iterator elemTypeIt = elemTypes.begin(); elemTypeIt != elemTypes.end(); elemTypeIt++)
  {
    ElementTypePtr elemType = *elemTypeIt;
    vector< ElementPtr > elems = mesh->elementsOfTypeGlobal(elemType);
    vector<GlobalIndexType> cellIDs;
    for (int i=0; i<elems.size(); i++)
    {
      cellIDs.push_back(elems[i]->cellID());
    }
    FieldContainer<double> physicalCellNodes = mesh->physicalCellNodesGlobal(elemType);
    BasisCachePtr basisCache = Teuchos::rcp( new BasisCache(elemType,mesh,polyOrder) ); // enrich by trial space order
    basisCache->setPhysicalCellNodes(physicalCellNodes,cellIDs,true); // true: create side caches
    FieldContainer<double> cellMeasures = basisCache->getCellMeasures();
    FieldContainer<double> fakeRHSIntegrals(elems.size(),testOrdering->totalDofs());
    massFluxTerm->integrate(fakeRHSIntegrals,testOrdering,basisCache,true); // true: force side evaluation
    //      cout << "fakeRHSIntegrals:\n" << fakeRHSIntegrals;
    for (int i=0; i<elems.size(); i++)
    {
      int cellID = cellIDs[i];
      // pick out the ones for testOne:
      massFluxIntegral[cellID] = fakeRHSIntegrals(i,testOneIndex);
    }
    // find the largest:
    for (int i=0; i<elems.size(); i++)
    {
      int cellID = cellIDs[i];
      maxMassFluxIntegral = max(abs(massFluxIntegral[cellID]), maxMassFluxIntegral);
    }
    for (int i=0; i<elems.size(); i++)
    {
      int cellID = cellIDs[i];
      maxCellMeasure = max(maxCellMeasure,cellMeasures(i));
      minCellMeasure = min(minCellMeasure,cellMeasures(i));
      maxMassFluxIntegral = max(abs(massFluxIntegral[cellID]), maxMassFluxIntegral);
      totalMassFlux += massFluxIntegral[cellID];
      totalAbsMassFlux += abs( massFluxIntegral[cellID] );
    }
  }
  if (rank==0)
  {
    cout << "largest mass flux: " << maxMassFluxIntegral << endl;
    cout << "total mass flux: " << totalMassFlux << endl;
    cout << "sum of mass flux absolute value: " << totalAbsMassFlux << endl;
    cout << "largest h: " << sqrt(maxCellMeasure) << endl;
    cout << "smallest h: " << sqrt(minCellMeasure) << endl;
    cout << "ratio of largest / smallest h: " << sqrt(maxCellMeasure) / sqrt(minCellMeasure) << endl;
  }
  if (rank == 0)
  {
    cout << "phi ID: " << phi->ID() << endl;
    cout << "psi1 ID: " << psi_1->ID() << endl;
    cout << "psi2 ID: " << psi_2->ID() << endl;

    cout << "streamMesh has " << streamMesh->numActiveElements() << " elements.\n";
    cout << "solving for approximate stream function...\n";
  }

  streamSolution->solve(useMumps);
  energyErrorTotal = streamSolution->energyErrorTotal();
  if (rank == 0)
  {
    cout << "...solved.\n";
    cout << "Stream mesh has energy error: " << energyErrorTotal << endl;
  }

  if (rank==0)
  {
    solution->writeToVTK("nsCavitySoln.vtk");
    if (! meshHasTriangles )
    {
      massFlux->writeBoundaryValuesToMATLABFile(solution->mesh(), "massFlux.dat");
      u_mag->writeValuesToMATLABFile(solution->mesh(), "u_mag.m");
      u_div->writeValuesToMATLABFile(solution->mesh(), "u_div.m");
      solution->writeFieldsToFile(u1->ID(), "u1.m");
      solution->writeFluxesToFile(u1hat->ID(), "u1_hat.dat");
      solution->writeFieldsToFile(u2->ID(), "u2.m");
      solution->writeFluxesToFile(u2hat->ID(), "u2_hat.dat");
      solution->writeFieldsToFile(p->ID(), "p.m");
      streamSolution->writeFieldsToFile(phi->ID(), "phi.m");

      streamSolution->writeFluxesToFile(phi_hat->ID(), "phi_hat.dat");
      streamSolution->writeFieldsToFile(psi_1->ID(), "psi1.m");
      streamSolution->writeFieldsToFile(psi_2->ID(), "psi2.m");
      vorticity->writeValuesToMATLABFile(streamMesh, "vorticity.m");

      FunctionPtr ten = Teuchos::rcp( new ConstantScalarFunction(10) );
      ten->writeBoundaryValuesToMATLABFile(solution->mesh(), "skeleton.dat");
      cout << "wrote files: u_mag.m, u_div.m, u1.m, u1_hat.dat, u2.m, u2_hat.dat, p.m, phi.m, vorticity.m.\n";
    }
    else
    {
      solution->writeToFile(u1->ID(), "u1.dat");
      solution->writeToFile(u2->ID(), "u2.dat");
      solution->writeToFile(u2->ID(), "p.dat");
      cout << "wrote files: u1.dat, u2.dat, p.dat\n";
    }

    FieldContainer<double> points = pointGrid(0, 1, 0, 1, 100);
    FieldContainer<double> pointData = solutionData(points, streamSolution, phi);
    GnuPlotUtil::writeXYPoints("phi_patch_navierStokes_cavity.dat", pointData);
    set<double> patchContourLevels = diagonalContourLevels(pointData,1);
    vector<string> patchDataPath;
    patchDataPath.push_back("phi_patch_navierStokes_cavity.dat");
    GnuPlotUtil::writeContourPlotScript(patchContourLevels, patchDataPath, "lidCavityNavierStokes.p");

    GnuPlotUtil::writeExactMeshSkeleton("lid_navierStokes_continuation_adaptive", mesh, 2);

    writePatchValues(0, 1, 0, 1, streamSolution, phi, "phi_patch.m");
    writePatchValues(0, .1, 0, .1, streamSolution, phi, "phi_patch_detail.m");
    writePatchValues(0, .01, 0, .01, streamSolution, phi, "phi_patch_minute_detail.m");
    writePatchValues(0, .001, 0, .001, streamSolution, phi, "phi_patch_minute_minute_detail.m");
  }

  return 0;
}
int main(int argc, char *argv[])
{

#ifdef HAVE_MPI
  Teuchos::GlobalMPISession mpiSession(&argc, &argv,0);

  Epetra_MpiComm Comm(MPI_COMM_WORLD);
#else
  Epetra_SerialComm Comm;
#endif

  int commRank = Teuchos::GlobalMPISession::getRank();

  Comm.Barrier(); // set breakpoint here to allow debugger attachment to other MPI processes than the one you automatically attached to.

  Teuchos::CommandLineProcessor cmdp(false,true); // false: don't throw exceptions; true: do return errors for unrecognized options

  // problem parameters:
  double mu = 0.1;
  double permCoef = 1e4;
  int numRefs = 0;
  int k = 2, delta_k = 2;
  string norm = "Graph";
  cmdp.setOption("polyOrder",&k,"polynomial order for field variable u");
  cmdp.setOption("delta_k", &delta_k, "test space polynomial order enrichment");
  cmdp.setOption("numRefs",&numRefs,"number of refinements");
  cmdp.setOption("norm", &norm, "norm");
  cmdp.setOption("mu", &mu, "mu");
  cmdp.setOption("permCoef", &permCoef, "Permeability coefficient");

  if (cmdp.parse(argc,argv) != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL)
  {
#ifdef HAVE_MPI
    MPI_Finalize();
#endif
    return -1;
  }

  FunctionPtr zero = TFunction<double>::zero();
  FunctionPtr one = TFunction<double>::constant(1);
  FunctionPtr sin2pix = Teuchos::rcp( new Sin_ax(2*pi) );
  FunctionPtr cos2pix = Teuchos::rcp( new Cos_ax(2*pi) );
  FunctionPtr sin2piy = Teuchos::rcp( new Sin_ay(2*pi) );
  FunctionPtr cos2piy = Teuchos::rcp( new Cos_ay(2*pi) );
  FunctionPtr u1_exact = sin2pix*cos2piy;
  FunctionPtr u2_exact = -cos2pix*sin2piy;
  FunctionPtr x2 = TFunction<double>::xn(2);
  FunctionPtr y2 = TFunction<double>::yn(2);
  FunctionPtr p_exact = x2*y2 - 1./9;
  FunctionPtr permInv = permCoef*(sin2pix + 1.1);

  VarFactoryPtr vf = VarFactory::varFactory();
  //fields:
  VarPtr sigma1 = vf->fieldVar("sigma1", VECTOR_L2);
  VarPtr sigma2 = vf->fieldVar("sigma2", VECTOR_L2);
  VarPtr u1 = vf->fieldVar("u1", L2);
  VarPtr u2 = vf->fieldVar("u2", L2);
  VarPtr p = vf->fieldVar("p", L2);

  // traces:
  VarPtr u1hat = vf->traceVar("u1hat");
  VarPtr u2hat = vf->traceVar("u2hat");
  VarPtr t1c = vf->fluxVar("t1c");
  VarPtr t2c = vf->fluxVar("t2c");

  // test:
  VarPtr v1 = vf->testVar("v1", HGRAD);
  VarPtr v2 = vf->testVar("v2", HGRAD);
  VarPtr tau1 = vf->testVar("tau1", HDIV);
  VarPtr tau2 = vf->testVar("tau2", HDIV);
  VarPtr q = vf->testVar("q", HGRAD);

  BFPtr bf = Teuchos::rcp( new BF(vf) );

  bf->addTerm(1./mu*sigma1, tau1);
  bf->addTerm(1./mu*sigma2, tau2);
  bf->addTerm(u1, tau1->div());
  bf->addTerm(u2, tau2->div());
  bf->addTerm(-u1hat, tau1->dot_normal());
  bf->addTerm(-u2hat, tau2->dot_normal());

  bf->addTerm(sigma1, v1->grad());
  bf->addTerm(sigma2, v2->grad());
  bf->addTerm(-p, v1->dx());
  bf->addTerm(-p, v2->dy());
  bf->addTerm(t1c, v1);
  bf->addTerm(t2c, v2);
  bf->addTerm(mu*permInv*u1, v1);
  bf->addTerm(mu*permInv*u2, v2);

  bf->addTerm(-u1, q->dx());
  bf->addTerm(-u2, q->dy());
  bf->addTerm(u1hat, q->times_normal_x());
  bf->addTerm(u2hat, q->times_normal_y());

  RHSPtr rhs = RHS::rhs();

  BCPtr bc = BC::bc();

  SpatialFilterPtr y_equals_one = SpatialFilter::matchingY(1.0);
  SpatialFilterPtr y_equals_zero = SpatialFilter::matchingY(0);
  SpatialFilterPtr x_equals_one = SpatialFilter::matchingX(1.0);
  SpatialFilterPtr x_equals_zero = SpatialFilter::matchingX(0.0);
  bc->addDirichlet(u1hat, y_equals_zero, u1_exact);
  bc->addDirichlet(u2hat, y_equals_zero, u2_exact);
  bc->addDirichlet(u1hat, x_equals_zero, u1_exact);
  bc->addDirichlet(u2hat, x_equals_zero, u2_exact);
  bc->addDirichlet(u1hat, y_equals_one, u1_exact);
  bc->addDirichlet(u2hat, y_equals_one, u2_exact);
  bc->addDirichlet(u1hat, x_equals_one, u1_exact);
  bc->addDirichlet(u2hat, x_equals_one, u2_exact);
  bc->addZeroMeanConstraint(p);

  MeshPtr mesh = MeshFactory::quadMesh(bf, k+1, delta_k, 1, 1, 4, 4);

  map<string, IPPtr> brinkmanIPs;
  brinkmanIPs["Graph"] = bf->graphNorm();

  brinkmanIPs["Decoupled"] = Teuchos::rcp(new IP);
  brinkmanIPs["Decoupled"]->addTerm(tau1);
  brinkmanIPs["Decoupled"]->addTerm(tau2);
  brinkmanIPs["Decoupled"]->addTerm(tau1->div());
  brinkmanIPs["Decoupled"]->addTerm(tau2->div());
  brinkmanIPs["Decoupled"]->addTerm(permInv*v1);
  brinkmanIPs["Decoupled"]->addTerm(permInv*v2);
  brinkmanIPs["Decoupled"]->addTerm(v1->grad());
  brinkmanIPs["Decoupled"]->addTerm(v2->grad());
  brinkmanIPs["Decoupled"]->addTerm(q);
  brinkmanIPs["Decoupled"]->addTerm(q->grad());

  // brinkmanIPs["CoupledRobust"] = Teuchos::rcp(new IP);
  // brinkmanIPs["CoupledRobust"]->addTerm(tau->div()-beta*v->grad());
  // brinkmanIPs["CoupledRobust"]->addTerm(Function<double>::min(one/Function<double>::h(),Function<double>::constant(1./sqrt(epsilon)))*tau);
  // brinkmanIPs["CoupledRobust"]->addTerm(sqrt(epsilon)*v->grad());
  // brinkmanIPs["CoupledRobust"]->addTerm(beta*v->grad());
  // brinkmanIPs["CoupledRobust"]->addTerm(Function<double>::min(sqrt(epsilon)*one/Function<double>::h(),one)*v);

  IPPtr ip = brinkmanIPs[norm];

  SolutionPtr soln = TSolution<double>::solution(mesh, bc, rhs, ip);

  double threshold = 0.20;
  RefinementStrategy refStrategy(soln, threshold);

  ostringstream refName;
  refName << "brinkman";
  HDF5Exporter exporter(mesh,refName.str());

  for (int refIndex=0; refIndex <= numRefs; refIndex++)
  {
    soln->solve(false);

    double energyError = soln->energyErrorTotal();
    if (commRank == 0)
    {
      // if (refIndex > 0)
      // refStrategy.printRefinementStatistics(refIndex-1);
      cout << "Refinement:\t " << refIndex << " \tElements:\t " << mesh->numActiveElements()
           << " \tDOFs:\t " << mesh->numGlobalDofs() << " \tEnergy Error:\t " << energyError << endl;
    }

    exporter.exportSolution(soln, refIndex);

    if (refIndex != numRefs)
      refStrategy.refine();
  }

  return 0;
}