// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
int main(int argc, char *argv[]) 
{
#ifdef HAVE_MPI  
  MPI_Init(&argc, &argv);
#endif

  using namespace std;
  using namespace Teuchos;
  using namespace PHX;
  
  try {
    
    RCP<Time> total_time = TimeMonitor::getNewTimer("Total Run Time");
    TimeMonitor tm(*total_time);

    RCP<Time> residual_eval_time = 
      TimeMonitor::getNewTimer("Residual Evaluation Time");
    RCP<Time> jacobian_eval_time = 
      TimeMonitor::getNewTimer("Jacobian Evaluation Time");
    RCP<Time> linear_solve_time = 
      TimeMonitor::getNewTimer("Linear Solve Time");
    RCP<Time> nonlinear_solve_time = 
      TimeMonitor::getNewTimer("Nonlinear Solve Time");
    RCP<Time> preconditioner_solve_time = 
      TimeMonitor::getNewTimer("Preconditioner Time");
    RCP<Time> setup_time = 
      TimeMonitor::getNewTimer("Setup Time (not scalable)");
    RCP<Time> jv_eval_time = 
      TimeMonitor::getNewTimer("Jv (AD)");
    RCP<Time> matvec = 
      TimeMonitor::getNewTimer("Matvec");

    setup_time->start();

    bool print_debug_info = false;

#ifdef HAVE_MPI
    RCP<Epetra_Comm> comm = rcp(new Epetra_MpiComm(MPI_COMM_WORLD));
#else
    RCP<Epetra_Comm> comm = rcp(new Epetra_SerialComm);
#endif
    
    Teuchos::basic_FancyOStream<char> os(rcp(&std::cout,false));
    os.setShowProcRank(true);
    os.setProcRankAndSize(comm->MyPID(), comm->NumProc());

    if (comm->MyPID() == 0)
      cout << "\nStarting FEM_Nonlinear Example!\n" << endl;

    // *********************************************************
    // * Build the Finite Element data structures
    // *********************************************************

    // Problem dimension - a 2D problem
    const static int dim = 2;

    // Create the mesh
    MeshBuilder mb(comm, 10, 3, 1.0, 1.0, 8);

    if (print_debug_info) 
      os << mb;

    std::vector<Element_Linear2D>& cells = *(mb.myElements());

    // Divide mesh into workset blocks
    const std::size_t workset_size = 15;
    std::vector<MyWorkset> worksets;
    {
      std::vector<Element_Linear2D>::iterator cell_it = cells.begin();
      std::size_t count = 0;
      MyWorkset w;
      w.local_offset = cell_it->localElementIndex();
      w.begin = cell_it;
      for (; cell_it != cells.end(); ++cell_it) {
	++count;
	std::vector<Element_Linear2D>::iterator next = cell_it;
	++next;
	
	if ( count == workset_size || next == cells.end()) {
	  w.end = next;
	  w.num_cells = count;
	  worksets.push_back(w);
	  count = 0;
	  
	  if (next != cells.end()) {
	    w.local_offset = next->localElementIndex();
	    w.begin = next;
	  }
	}
      }
    }
    
    if (print_debug_info) {
      cout << "Printing Element Information" << endl;
      for (std::size_t i = 0; i < worksets.size(); ++i) {
	std::vector<Element_Linear2D>::iterator it = worksets[i].begin;
	for (; it != worksets[i].end; ++it)
	  cout << *it << endl;
      }
    }
    
    if (print_debug_info) {
      for (std::size_t i = 0; i < worksets.size(); ++i) {
	cout << "Printing Workset Information" << endl;
	cout << "worksets[" << i << "]" << endl;
	cout << "  num_cells =" << worksets[i].num_cells << endl;
	cout << "  local_offset =" << worksets[i].local_offset << endl;
	std::vector<Element_Linear2D>::iterator it = worksets[i].begin;
	for (; it != worksets[i].end; ++it)
	  cout << "  cell_local_index =" << it->localElementIndex() << endl;
      }
      cout << endl;
    }

    // *********************************************************
    // * Build the Newton solver data structures
    // *********************************************************

    // Setup Nonlinear Problem (build Epetra_Vector and Epetra_CrsMatrix)
    // Newton's method: J delta_x = -f
    const std::size_t num_eq = 2;
    
    LinearObjectFactory lof(mb, comm, num_eq);

    if (print_debug_info) {
      ofstream file("OwnedGraph.dat", ios::out | ios::app);
      Teuchos::basic_FancyOStream<char> p(rcp(&file,false)); 
      p.setShowProcRank(true); 
      p.setProcRankAndSize(comm->MyPID(), comm->NumProc()); 	
      lof.ownedGraph()->Print(p);
    }

    Epetra_Map owned_map = *(lof.ownedMap());
    Epetra_Map overlapped_map = *(lof.overlappedMap());
    Epetra_CrsGraph owned_graph = *(lof.ownedGraph());
    Epetra_CrsGraph overlapped_graph = *(lof.overlappedGraph());

    // Solution vector x
    RCP<Epetra_Vector> owned_x = rcp(new Epetra_Vector(owned_map,true));
    RCP<Epetra_Vector> overlapped_x =  
      rcp(new Epetra_Vector(overlapped_map,true));

    // Update vector x
    RCP<Epetra_Vector> owned_delta_x = rcp(new Epetra_Vector(owned_map,true));

    // Residual vector f
    RCP<Epetra_Vector> owned_f = rcp(new Epetra_Vector(owned_map,true));
    RCP<Epetra_Vector> overlapped_f =  
      rcp(new Epetra_Vector(overlapped_map,true));

    // Jacobian Matrix
    Epetra_DataAccess copy = ::Copy;
    RCP<Epetra_CrsMatrix> owned_jac = 
      rcp(new Epetra_CrsMatrix(copy, owned_graph));
    RCP<Epetra_CrsMatrix> overlapped_jac = 
      rcp(new Epetra_CrsMatrix(copy, overlapped_graph));

    // Import/export
    RCP<Epetra_Import> importer = 
      rcp(new Epetra_Import(overlapped_map, owned_map));
    RCP<Epetra_Export> exporter = 
      rcp(new Epetra_Export(overlapped_map, owned_map));

    // Sets bc for initial guess
    applyBoundaryConditions(1.0, *owned_x, *owned_jac, *owned_f, mb);
    
    // *********************************************************
    // * Build the FieldManager
    // *********************************************************

    RCP< vector<string> > dof_names = rcp(new vector<string>(num_eq));
    (*dof_names)[0] = "Temperature";
    (*dof_names)[1] = "Velocity X";

    RCP<DataLayout> qp_scalar = 
      rcp(new MDALayout<Cell,QuadPoint>(workset_size,4));
    RCP<DataLayout> node_scalar = 
      rcp(new MDALayout<Cell,Node>(workset_size,4));
    
    RCP<DataLayout> qp_vec = 
      rcp(new MDALayout<Cell,QuadPoint,Dim>(workset_size,4,dim));
    RCP<DataLayout> node_vec = 
      rcp(new MDALayout<Cell,Node,Dim>(workset_size,4,dim));

    RCP<DataLayout> dummy = rcp(new MDALayout<Cell>(0));
    
    map<string, RCP<ParameterList> > evaluators_to_build;
    
    { // Gather Solution
      RCP<ParameterList> p = rcp(new ParameterList);
      int type = MyFactoryTraits<MyTraits>::id_gather_solution;
      p->set<int>("Type", type);
      p->set< RCP< vector<string> > >("Solution Names", dof_names);
      p->set< RCP<Epetra_Vector> >("Solution Vector", overlapped_x);
      p->set< RCP<DataLayout> >("Data Layout", node_scalar);
      evaluators_to_build["Gather Solution"] = p;
    }
    
    { // FE Interpolation - Temperature
      RCP<ParameterList> p = rcp(new ParameterList);
      
      int type = MyFactoryTraits<MyTraits>::id_feinterpolation;
      p->set<int>("Type", type);
      
      p->set<string>("Node Variable Name", "Temperature");
      p->set<string>("QP Variable Name", "Temperature");
      p->set<string>("Gradient QP Variable Name", "Temperature Gradient");
      
      p->set< RCP<DataLayout> >("Node Data Layout", node_scalar);
      p->set< RCP<DataLayout> >("QP Scalar Data Layout", qp_scalar);
      p->set< RCP<DataLayout> >("QP Vector Data Layout", qp_vec);
      
      evaluators_to_build["FE Interpolation Temperature"] = p;
    }
     
    { // FE Interpolation - Velocity X
      RCP<ParameterList> p = rcp(new ParameterList);
      
      int type = MyFactoryTraits<MyTraits>::id_feinterpolation;
      p->set<int>("Type", type);
      
      p->set<string>("Node Variable Name", "Velocity X");
      p->set<string>("QP Variable Name", "Velocity X");
      p->set<string>("Gradient QP Variable Name", "Velocity X Gradient");
      
      p->set< RCP<DataLayout> >("Node Data Layout", node_scalar);
      p->set< RCP<DataLayout> >("QP Scalar Data Layout", qp_scalar);
      p->set< RCP<DataLayout> >("QP Vector Data Layout", qp_vec);
      
      evaluators_to_build["FE Interpolation Velocity X"] = p;
    }

    { // Evaluate residual
      RCP<ParameterList> p = rcp(new ParameterList);
      int type = MyFactoryTraits<MyTraits>::id_equations;
      p->set<int>("Type", type);
      p->set< RCP< vector<string> > >("Solution Names", dof_names);
      p->set< RCP<DataLayout> >("Node Data Layout", node_scalar);
      p->set< RCP<DataLayout> >("QP Data Layout", qp_scalar);
      p->set< RCP<DataLayout> >("Gradient QP Data Layout", qp_vec);
      evaluators_to_build["Equations"] = p;
    }
 
    { // Scatter Solution
      RCP<ParameterList> p = rcp(new ParameterList);
      int type = MyFactoryTraits<MyTraits>::id_scatter_residual;
      p->set<int>("Type", type);

      RCP< vector<string> > res_names = rcp(new vector<string>(num_eq));
      (*res_names)[0] = "Residual Temperature";
      (*res_names)[1] = "Residual Velocity X";

      p->set< RCP< vector<string> > >("Residual Names", res_names);
      p->set< RCP<Epetra_Vector> >("Residual Vector", overlapped_f);
      p->set< RCP<Epetra_CrsMatrix> >("Jacobian Matrix", overlapped_jac);
      p->set< RCP<DataLayout> >("Dummy Data Layout", dummy);
      p->set< RCP<DataLayout> >("Data Layout", node_scalar);
      evaluators_to_build["Scatter Residual"] = p;
    }
    
    // Build Field Evaluators for each evaluation type
    EvaluatorFactory<MyTraits,MyFactoryTraits<MyTraits> > factory;
    RCP< vector< RCP<Evaluator_TemplateManager<MyTraits> > > > 
      evaluators;
    evaluators = factory.buildEvaluators(evaluators_to_build);
    
    // Create a FieldManager
    FieldManager<MyTraits> fm;
      
    // Register all Evaluators 
    registerEvaluators(evaluators, fm);

    // Request quantities to assemble RESIDUAL PDE operators
    {
      typedef MyTraits::Residual::ScalarT ResScalarT;
      Tag<ResScalarT> res_tag("Scatter", dummy);
      fm.requireField<MyTraits::Residual>(res_tag);
      
      // Request quantities to assemble JACOBIAN PDE operators
      typedef MyTraits::Jacobian::ScalarT JacScalarT;
      Tag<JacScalarT> jac_tag("Scatter", dummy);
      fm.requireField<MyTraits::Jacobian>(jac_tag);

      // Request quantities to assemble Jv operators
      typedef MyTraits::Jv::ScalarT JvScalarT;
      Tag<JvScalarT> jv_tag("Scatter", dummy);
      fm.requireField<MyTraits::Jv>(jv_tag);
    }

    {
      RCP<Time> registration_time = 
	TimeMonitor::getNewTimer("Post Registration Setup Time");
      {
	TimeMonitor t(*registration_time);
	fm.postRegistrationSetupForType<MyTraits::Residual>(NULL);
	fm.postRegistrationSetupForType<MyTraits::Jacobian>(NULL);
	fm.postRegistrationSetupForType<MyTraits::Jv>(NULL);
      }
    }

    if (print_debug_info)
      cout << fm << endl;

    // *********************************************************
    // * Evaluate Jacobian and Residual (required for ML to 
    // * to be constructed properly
    // *********************************************************    
    {
      //TimeMonitor t(*jacobian_eval_time);

      overlapped_x->Import(*owned_x, *importer, Insert);
      
      owned_f->PutScalar(0.0);
      overlapped_f->PutScalar(0.0);
      owned_jac->PutScalar(0.0);
      overlapped_jac->PutScalar(0.0);
      
      for (std::size_t i = 0; i < worksets.size(); ++i)
	fm.evaluateFields<MyTraits::Jacobian>(worksets[i]);
      
      owned_f->Export(*overlapped_f, *exporter, Add);
      owned_jac->Export(*overlapped_jac, *exporter, Add);
      
      applyBoundaryConditions(1.0, *owned_x, *owned_jac, *owned_f, mb);
    }

    // *********************************************************
    // * Build Preconditioner (Ifpack or ML)
    // *********************************************************    
    bool use_ml = false;

    RCP<Belos::EpetraPrecOp> belosPrec;
    RCP<Ifpack_Preconditioner> ifpack_prec;
    RCP<ML_Epetra::MultiLevelPreconditioner> ml_prec;

    if (!use_ml) {
      Ifpack Factory;
      std::string PrecType = "ILU";
      int OverlapLevel = 0;
      ifpack_prec = 
	Teuchos::rcp( Factory.Create(PrecType,owned_jac.get(),OverlapLevel) );
      ParameterList ifpackList;
      ifpackList.set("fact: drop tolerance", 1e-9);
      ifpackList.set("fact: level-of-fill", 1);
      ifpackList.set("schwarz: combine mode", "Add");
      IFPACK_CHK_ERR(ifpack_prec->SetParameters(ifpackList));
      IFPACK_CHK_ERR(ifpack_prec->Initialize());
      belosPrec = rcp( new Belos::EpetraPrecOp( ifpack_prec ) );
    }
    else {
      ParameterList ml_params;
      ML_Epetra::SetDefaults("SA",ml_params);
      //ml_params.set("Base Method Defaults", "SA");
      ml_params.set("ML label", "Phalanx_Test");
      ml_params.set("ML output", 10);
      ml_params.set("print unused", 1);
      ml_params.set("max levels", 4);
      ml_params.set("PDE equations",2);
      ml_params.set("prec type","MGV");
      ml_params.set("increasing or decreasing","increasing");
      // ml_params.set("aggregation: nodes per aggregate",50);
      ml_params.set("aggregation: type","Uncoupled");
      ml_params.set("aggregation: damping factor", 0.0);
      ml_params.set("coarse: type","Amesos-KLU");
      //ml_params.set("coarse: type","IFPACK");
      ml_params.set("coarse: max size", 1000);
      //ml_params.set("smoother: type","IFPACK");
      ml_params.set("smoother: type","block Gauss-Seidel");
      ml_params.set("smoother: ifpack type","ILU");
      ml_params.set("smoother: ifpack overlap",1);
      ml_params.sublist("smoother: ifpack list").set("fact: level-of-fill",1);
      ml_params.sublist("smoother: ifpack list").set("schwarz: reordering type","rcm");
      ml_prec = rcp( new ML_Epetra::MultiLevelPreconditioner(*owned_jac, 
							     ml_params, 
							     true) );
      belosPrec = rcp( new Belos::EpetraPrecOp( ml_prec ) );
    }

    // *********************************************************
    // * Build linear solver (Belos)
    // *********************************************************
    
    // Linear solver parameters
    typedef double                            ST;
    typedef Teuchos::ScalarTraits<ST>        SCT;
    typedef SCT::magnitudeType                MT;
    typedef Epetra_MultiVector                MV;
    typedef Epetra_Operator                   OP;
    typedef Belos::MultiVecTraits<ST,MV>     MVT;
    typedef Belos::OperatorTraits<ST,MV,OP>  OPT;
    
    RCP<ParameterList> belosList = rcp(new ParameterList);
    belosList->set<int>("Num Blocks", 400);
    belosList->set<int>("Block Size", 1);
    belosList->set<int>("Maximum Iterations", 400);
    belosList->set<int>("Maximum Restarts", 0);
    belosList->set<MT>( "Convergence Tolerance", 1.0e-4);
    int verbosity = Belos::Errors + Belos::Warnings;
    if (false) {
      verbosity += Belos::TimingDetails + Belos::StatusTestDetails;
      belosList->set<int>( "Output Frequency", -1);
    }
    if (print_debug_info) {
      verbosity += Belos::Debug;
      belosList->set<int>( "Output Frequency", -1);
    }
    belosList->set( "Verbosity", verbosity );
    
    RCP<Epetra_MultiVector> F = 
      Teuchos::rcp_implicit_cast<Epetra_MultiVector>(owned_f);
    
    RCP<Epetra_MultiVector> DX = 
      Teuchos::rcp_implicit_cast<Epetra_MultiVector>(owned_delta_x);
    
    RCP<Belos::LinearProblem<double,MV,OP> > problem =
      rcp(new Belos::LinearProblem<double,MV,OP>(owned_jac, DX, F) );

    problem->setRightPrec( belosPrec );

    RCP< Belos::SolverManager<double,MV,OP> > gmres_solver = 
      rcp( new Belos::BlockGmresSolMgr<double,MV,OP>(problem, belosList) );
    
    setup_time->stop();










    // Timing for Mat-Vec using sacado
    RCP<Epetra_Vector> owned_v = rcp(new Epetra_Vector(owned_map,true));
    RCP<Epetra_Vector> overlapped_v = 
      rcp(new Epetra_Vector(overlapped_map,true));
    RCP<Epetra_Vector> owned_Jv = rcp(new Epetra_Vector(owned_map,true));
    RCP<Epetra_Vector> overlapped_Jv = 
      rcp(new Epetra_Vector(overlapped_map,true));

    owned_x->PutScalar(1.0);
    owned_v->PutScalar(1.0);

    int iter = 0;
    while (iter != 30) {

      overlapped_x->Import(*owned_x, *importer, Insert);

      overlapped_v->Import(*owned_v, *importer, Insert);

      owned_f->PutScalar(0.0);
      overlapped_f->PutScalar(0.0);
      owned_jac->PutScalar(0.0);
      overlapped_jac->PutScalar(0.0);
      owned_Jv->PutScalar(0.0);
      overlapped_Jv->PutScalar(0.0);

      for (std::size_t i = 0; i < worksets.size(); ++i) {
	worksets[i].v = overlapped_v;
	worksets[i].Jv = overlapped_Jv;
      }
      {
	TimeMonitor t(*residual_eval_time);
	for (std::size_t i = 0; i < worksets.size(); ++i)
	  fm.evaluateFields<MyTraits::Residual>(worksets[i]);
      }
      {
	TimeMonitor t(*jacobian_eval_time);
	for (std::size_t i = 0; i < worksets.size(); ++i)
	  fm.evaluateFields<MyTraits::Jacobian>(worksets[i]);
      }
      {
	TimeMonitor t(*jv_eval_time);
	for (std::size_t i = 0; i < worksets.size(); ++i)
	  fm.evaluateFields<MyTraits::Jv>(worksets[i]);
      }

      owned_f->Export(*overlapped_f, *exporter, Add);
      owned_jac->Export(*overlapped_jac, *exporter, Add);
      owned_Jv->Export(*overlapped_Jv, *exporter, Add);

      applyBoundaryConditions(1.0, *owned_x, *owned_jac, *owned_f, mb);

      {
	TimeMonitor t(*matvec);
	owned_jac->Apply(*owned_x, *owned_f);
      }

      ++iter;
    }

    //owned_jac->Print(std::cout);
    //overlapped_Jv->Print(std::cout);
    //owned_Jv->Print(std::cout);
    




    


    // NOTE: in the future we can set up the nonlinear solver below to
    // do Jacobian-Free Newton-Krylov solves to test the matvec


    /*


    // *********************************************************
    // * Solve the system
    // *********************************************************

    // Set initial guess
    owned_x->PutScalar(1.0);

    // Evaluate Residual
    {
      TimeMonitor t(*residual_eval_time);

      overlapped_x->Import(*owned_x, *importer, Insert);

      owned_f->PutScalar(0.0);
      overlapped_f->PutScalar(0.0);

      for (std::size_t i = 0; i < worksets.size(); ++i)
	fm.evaluateFields<MyTraits::Residual>(worksets[i]);
      
      owned_f->Export(*overlapped_f, *exporter, Add);

      applyBoundaryConditions(1.0, *owned_x, *owned_jac, *owned_f, mb);
    }
    
    if (print_debug_info) {
      printVector("x_owned", *owned_x, -1);
      printVector("f_owned", *owned_f, -1);
    }

    // Newton Loop
    bool converged = false;
    std::size_t num_newton_steps = 0;
    std::size_t num_gmres_iterations = 0;
    checkConvergence(comm->MyPID(), num_newton_steps, *owned_f, 
		     *owned_delta_x, converged);

    while (!converged && num_newton_steps < 20) {
      
      TimeMonitor t(*nonlinear_solve_time);

      // Evaluate Residual and Jacobian
      {
	TimeMonitor t(*jacobian_eval_time);

	overlapped_x->Import(*owned_x, *importer, Insert);

	owned_f->PutScalar(0.0);
	overlapped_f->PutScalar(0.0);
	owned_jac->PutScalar(0.0);
	overlapped_jac->PutScalar(0.0);
	
	for (std::size_t i = 0; i < worksets.size(); ++i)
	  fm.evaluateFields<MyTraits::Jacobian>(worksets[i]);

	owned_f->Export(*overlapped_f, *exporter, Add);
	owned_jac->Export(*overlapped_jac, *exporter, Add);

	applyBoundaryConditions(1.0, *owned_x, *owned_jac, *owned_f, mb);
      }

      if (print_debug_info) {
	printVector("x_owned", *owned_x, num_newton_steps);
	printVector("x_overlapped", *overlapped_x, num_newton_steps);
	printVector("f_owned", *owned_f, num_newton_steps);
	printMatrix("jacobian_owned", *owned_jac, num_newton_steps);
      }
      
      owned_f->Scale(-1.0);

      // Solve linear problem
      {
	TimeMonitor t(*linear_solve_time);
	
	owned_delta_x->PutScalar(0.0);

	{
	  TimeMonitor tp(*preconditioner_solve_time);
	  
	  if (use_ml)
	    ml_prec->ReComputePreconditioner();
	  else
	    IFPACK_CHK_ERR(ifpack_prec->Compute());
	}

	problem->setProblem();

	Belos::ReturnType ret = gmres_solver->solve();

	int num_iters = gmres_solver->getNumIters();
	num_gmres_iterations += num_iters; 
	//if (print_debug_info) 
	if (comm->MyPID() == 0)
	  std::cout << "Number of gmres iterations performed for this solve: " 
		    << num_iters << std::endl;
	
	if (ret!=Belos::Converged && comm->MyPID() == 0) {
	  std::cout << std::endl << "WARNING:  Belos did not converge!" 
		    << std::endl;
	}
	
      }
      
      owned_x->Update(1.0, *owned_delta_x, 1.0);

      { // Evaluate Residual Only
	TimeMonitor t(*residual_eval_time);
	
	overlapped_x->Import(*owned_x, *importer, Insert);

	owned_f->PutScalar(0.0);
	overlapped_f->PutScalar(0.0);

	for (std::size_t i = 0; i < worksets.size(); ++i)
	  fm.evaluateFields<MyTraits::Residual>(worksets[i]);
	
	owned_f->Export(*overlapped_f, *exporter, Add);

	applyBoundaryConditions(1.0, *owned_x, *owned_jac, *owned_f, mb);
      }

      num_newton_steps += 1;

      checkConvergence(comm->MyPID(), num_newton_steps, *owned_f, 
		       *owned_delta_x, converged);
    }
     
    if (print_debug_info)
      printVector("f_owned", *owned_f, num_newton_steps);

    if (comm->MyPID() == 0) {
      if (converged)
	cout << "\n\nNewton Iteration Converged!\n" << endl;
      else
	cout << "\n\nNewton Iteration Failed to Converge!\n" << endl;
    }

    RCP<Time> file_io = 
      TimeMonitor::getNewTimer("File IO");
    {
      TimeMonitor t(*file_io);
      
      // Create a list of node coordinates
      std::map<int, std::vector<double> > coordinates;
      Teuchos::RCP< std::vector<Element_Linear2D> > cells = mb.myElements();
      for (std::vector<Element_Linear2D>::iterator cell = cells->begin();
	   cell != cells->end(); ++cell) {
	
	const shards::Array<double,shards::NaturalOrder,Node,Dim>& coords = 
	  cell->nodeCoordinates();

	for (int node=0; node < cell->numNodes(); ++node) {
	  coordinates[cell->globalNodeId(node)].resize(dim);
	  coordinates[cell->globalNodeId(node)][0] = coords(node,0);
	  coordinates[cell->globalNodeId(node)][1] = coords(node,1);
	}	
      }


      {
	std::vector< RCP<ofstream> > files; 
	for (std::size_t eq = 0; eq < num_eq; ++eq) {
	  std::stringstream ost;
	  ost << "upper_DOF" << eq << "_PID" << comm->MyPID() << ".dat";
	  files.push_back( rcp(new std::ofstream(ost.str().c_str()), 
			       ios::out | ios::trunc) );
	  files[eq]->precision(10);
	}
	
	const std::vector<int>& node_list = mb.topNodeSetGlobalIds();
	for (std::size_t node = 0; node < node_list.size(); ++node) {
	  int lid = owned_x->Map().LID(node_list[node] * num_eq);
	  for (std::size_t eq = 0; eq < num_eq; ++eq) {
	    int dof_index = lid + eq;
	    *(files[eq]) << coordinates[node_list[node]][0] << "   " 
			 << (*owned_x)[dof_index] << endl;
	  }
	}
      }

      {
	std::vector< RCP<ofstream> > files; 
	for (std::size_t eq = 0; eq < num_eq; ++eq) {
	  std::stringstream ost;
	  ost << "lower_DOF" << eq << "_PID" << comm->MyPID() << ".dat";
	  files.push_back( rcp(new std::ofstream(ost.str().c_str()), 
			       ios::out | ios::trunc) );
	  files[eq]->precision(10);
	}
	
	const std::vector<int>& node_list = mb.bottomNodeSetGlobalIds();
	for (std::size_t node = 0; node < node_list.size(); ++node) {
	  int lid = owned_x->Map().LID(node_list[node] * num_eq);
	  for (std::size_t eq = 0; eq < num_eq; ++eq) {
	    int dof_index = lid + eq;
	    *(files[eq]) << coordinates[node_list[node]][0] << "   " 
			 << (*owned_x)[dof_index] << endl;
	  }
	}
      }

    }

    TEUCHOS_TEST_FOR_EXCEPTION(!converged, std::runtime_error,
		       "Problem failed to converge!");

    TEUCHOS_TEST_FOR_EXCEPTION(num_newton_steps != 10, std::runtime_error,
		       "Incorrect number of Newton steps!");

    // Only check num gmres steps in serial
#ifndef HAVE_MPI
    TEUCHOS_TEST_FOR_EXCEPTION(num_gmres_iterations != 10, std::runtime_error,
		       "Incorrect number of GMRES iterations!");
#endif


    */

    // *********************************************************************
    // Finished all testing
    // *********************************************************************
    if (comm->MyPID() == 0)
      std::cout << "\nRun has completed successfully!\n" << std::endl; 
    // *********************************************************************
    // *********************************************************************

  }
  catch (const std::exception& e) {
    std::cout << "************************************************" << endl;
    std::cout << "************************************************" << endl;
    std::cout << "Exception Caught!" << endl;
    std::cout << "Error message is below\n " << e.what() << endl;
    std::cout << "************************************************" << endl;
  }
  catch (...) {
    std::cout << "************************************************" << endl;
    std::cout << "************************************************" << endl;
    std::cout << "Unknown Exception Caught!" << endl;
    std::cout << "************************************************" << endl;
  }

  TimeMonitor::summarize();
    
#ifdef HAVE_MPI
  MPI_Finalize();
#endif

  return 0;
}
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
int main(int argc, char *argv[]) 
{
  using namespace std;
  using namespace Teuchos;
  using namespace PHX;
  
  GlobalMPISession mpi_session(&argc, &argv);

  try {

    PHX::InitializeKokkosDevice();
    
    RCP<Time> total_time = TimeMonitor::getNewTimer("Total Run Time");
    TimeMonitor tm(*total_time);

    bool print_debug_info = false;

    {
      cout << "\nStarting MultiDimensionalArray EnergyFlux Example!" << endl;

      // Assume we have 102 cells on processor and can fit 20 cells in cache
      typedef PHX::Device::size_type size_type;
      const size_type num_local_cells = 102;
      const size_type workset_size = 20;

      RCP<DataLayout> qp_scalar = 
	rcp(new MDALayout<Cell,Node>(workset_size,4));
      RCP<DataLayout> node_scalar = 
	rcp(new MDALayout<Cell,QuadPoint>(workset_size,4));

      RCP<DataLayout> qp_vec = 
	rcp(new MDALayout<Cell,Node,Dim>(workset_size,4,3));
      RCP<DataLayout> node_vec = 
	rcp(new MDALayout<Cell,QuadPoint,Dim>(workset_size,4,3));

      RCP<DataLayout> qp_mat = 
	rcp(new MDALayout<Cell,Node,Dim,Dim>(workset_size,4,3,3));
      RCP<DataLayout> node_mat = 
	rcp(new MDALayout<Cell,QuadPoint,Dim,Dim>(workset_size,4,3,3));

      // Parser will build parameter list that determines the field
      // evaluators to build
      map<string, RCP<ParameterList> > evaluators_to_build;
      
      { // Temperature
	RCP<ParameterList> p = rcp(new ParameterList);
	int type = MyFactoryTraits<MyTraits>::id_constant;
	p->set<int>("Type", type);
	p->set<string>("Name", "Temperature");
	p->set<double>("Value", 2.0);
	p->set< RCP<DataLayout> >("Data Layout", node_scalar);
	evaluators_to_build["DOF_Temperature"] = p;
      }
      { // Density
	RCP<ParameterList> p = rcp(new ParameterList);
	int type = MyFactoryTraits<MyTraits>::id_density;
	p->set<int>("Type", type);
	p->set< RCP<DataLayout> >("Data Layout", qp_scalar);
	evaluators_to_build["Density"] = p;
      }

      { // Constant Thermal Conductivity
	RCP<ParameterList> p = rcp(new ParameterList);
	int type = MyFactoryTraits<MyTraits>::id_constant;
	p->set<int>("Type", type);
	p->set<string>("Name", "Thermal Conductivity");
	p->set<double>("Value", 2.0);
	p->set< RCP<DataLayout> >("Data Layout", qp_scalar);
	evaluators_to_build["Thermal Conductivity"] = p;
      }
      
      { // Nonlinear Source
	RCP<ParameterList> p = rcp(new ParameterList);
	int type = MyFactoryTraits<MyTraits>::id_nonlinearsource;
	p->set<int>("Type", type);
	p->set< RCP<DataLayout> >("Data Layout", qp_scalar);
	evaluators_to_build["Nonlinear Source"] = p;
      }

      { // Fourier Energy Flux
	RCP<ParameterList> p = rcp(new ParameterList);
	int type = MyFactoryTraits<MyTraits>::id_fourier;
	p->set<int>("Type", type);
	p->set< RCP<DataLayout> >("Scalar Data Layout", qp_scalar);
	p->set< RCP<DataLayout> >("Vector Data Layout", qp_vec);
	evaluators_to_build["Energy Flux"] = p;
      }

      { // FE Interpolation
	RCP<ParameterList> p = rcp(new ParameterList);

	int type = MyFactoryTraits<MyTraits>::id_feinterpolation;
	p->set<int>("Type", type);

	p->set<string>("Node Variable Name", "Temperature");
	p->set<string>("QP Variable Name", "Temperature");
	p->set<string>("Gradient QP Variable Name", "Temperature Gradient");

	p->set< RCP<DataLayout> >("Node Data Layout", node_scalar);
	p->set< RCP<DataLayout> >("QP Scalar Data Layout", qp_scalar);
	p->set< RCP<DataLayout> >("QP Vector Data Layout", qp_vec);

	evaluators_to_build["FE Interpolation"] = p;
      }
      
      // Build Field Evaluators for each evaluation type
      EvaluatorFactory<MyTraits,MyFactoryTraits<MyTraits> > factory;
      RCP< vector< RCP<Evaluator_TemplateManager<MyTraits> > > >  evaluators;
      evaluators = factory.buildEvaluators(evaluators_to_build);
 
      // Create a FieldManager
      FieldManager<MyTraits> fm;
      
      // Register all Evaluators 
      registerEvaluators(evaluators, fm);

      // Request quantities to assemble RESIDUAL PDE operators
      typedef MyTraits::Residual::ScalarT ResScalarT;
      
      Tag<ResScalarT> energy_flux_tag("Energy_Flux", qp_vec);
      fm.requireField<MyTraits::Residual>(energy_flux_tag);
      
      Tag<ResScalarT> source_tag("Nonlinear Source", qp_scalar);
      fm.requireField<MyTraits::Residual>(source_tag);

      // Request quantities to assemble JACOBIAN PDE operators
      typedef MyTraits::Jacobian::ScalarT JacScalarT;
      
      Tag<JacScalarT> j_energy_flux_tag("Energy_Flux", qp_vec);
      fm.requireField<MyTraits::Jacobian>(j_energy_flux_tag);
      
      Tag<JacScalarT> j_source_tag("Nonlinear Source", qp_scalar);
      fm.requireField<MyTraits::Jacobian>(j_source_tag);

      // For Kokkos extended types (Sacado FAD) set derivtive array
      // size
      std::vector<PHX::index_size_type> derivative_dimensions;
      derivative_dimensions.push_back(8);
      fm.setKokkosExtendedDataTypeDimensions<MyTraits::Jacobian>(derivative_dimensions);

      RCP<Time> registration_time = 
	TimeMonitor::getNewTimer("Post Registration Setup Time");
      {
	TimeMonitor t(*registration_time);
	fm.postRegistrationSetup(NULL);
      }

      cout << fm << endl;

      fm.writeGraphvizFile<MyTraits::Residual>("graph_residual.dot");
      fm.writeGraphvizFile<MyTraits::Jacobian>("graph_jacobian.dot");
      fm.writeGraphvizFile("all_graph", ".dot");      

      // Create Workset information: Cells and EvalData objects
      std::vector<MyCell> cells(num_local_cells);
      for (std::size_t i = 0; i < cells.size(); ++i)
	cells[i].setLocalIndex(i);
      std::vector<MyWorkset> worksets;

      std::vector<MyCell>::iterator cell_it = cells.begin();
      std::size_t count = 0;
      MyWorkset w;
      w.local_offset = cell_it->localIndex();
      w.begin = cell_it;
      for (; cell_it != cells.end(); ++cell_it) {
	++count;
	std::vector<MyCell>::iterator next = cell_it;
	++next;
	
	if ( count == workset_size || next == cells.end()) {
	  w.end = next;
	  w.num_cells = count;
	  worksets.push_back(w);
	  count = 0;

	  if (next != cells.end()) {
	    w.local_offset = next->localIndex();
	    w.begin = next;
	  }
	}
      }
      
      if (print_debug_info) {
	for (std::size_t i = 0; i < worksets.size(); ++i) {
	  cout << "worksets[" << i << "]" << endl;
	  cout << "  num_cells =" << worksets[i].num_cells << endl;
	  cout << "  local_offset =" << worksets[i].local_offset << endl;
	  std::vector<MyCell>::iterator it = worksets[i].begin;
	  for (; it != worksets[i].end; ++it)
	    cout << "  cell_local_index =" << it->localIndex() << endl;
	}
	cout << endl;
      }

      // Create local vectors to hold flux and source at quad points
      std::vector<double> 
	local_energy_flux_at_qp(num_local_cells * qp_vec->size());
      std::vector<double> 
	local_source_at_qp(num_local_cells * qp_scalar->size());

      // Fields we require
      MDField<double,Cell,QuadPoint,Dim> energy_flux(energy_flux_tag);
      MDField<double,Cell,QuadPoint> source(source_tag);
      fm.getFieldData<double,MyTraits::Residual,Cell,QuadPoint,Dim>(energy_flux);
      fm.getFieldData<double,MyTraits::Residual,Cell,QuadPoint>(source);

      // Get host arrays to copy from device back to host
      auto host_energy_flux = Kokkos::create_mirror_view(energy_flux.get_kokkos_view());
      auto host_source = Kokkos::create_mirror_view(source.get_kokkos_view());

      RCP<Time> eval_time = TimeMonitor::getNewTimer("Evaluation Time");

      fm.preEvaluate<MyTraits::Residual>(NULL);
      {
	TimeMonitor t(*eval_time);

	for (std::size_t i = 0; i < worksets.size(); ++i) {
#ifdef PHX_ENABLE_KOKKOS_AMT
	  fm.evaluateFieldsTaskParallel<MyTraits::Residual>(1,worksets[i]);
#else
	  fm.evaluateFields<MyTraits::Residual>(worksets[i]);
#endif	  
	  // Use values: in this example, move values into local host arrays
	  Kokkos::deep_copy(host_energy_flux,energy_flux.get_kokkos_view());
	  Kokkos::deep_copy(host_source,source.get_kokkos_view());

	  for (size_type cell = 0; cell < energy_flux.dimension(0); ++cell) {
	    for (size_type ip = 0; ip < energy_flux.dimension(1); ++ip) {
	      for (size_type dim = 0; dim < energy_flux.dimension(2); ++dim) {
		std::size_t index = cell * energy_flux.dimension(0) +
		  ip * energy_flux.dimension(1) + dim;
		local_energy_flux_at_qp[index] =  host_energy_flux(cell,ip,dim);
	      }
	    }
	  }
	  for (size_type cell = 0; cell < energy_flux.dimension(0); ++cell) {
	    for (size_type ip = 0; ip < energy_flux.dimension(1); ++ip) {
	      std::size_t index = cell * energy_flux.dimension(0) + ip;
	      local_source_at_qp[index] =  host_source(cell,ip);
	    }
	  }

	}

      }
      fm.postEvaluate<MyTraits::Residual>(NULL);

      // Test data retrieval
      cout << "Testing data members" << endl;
      Tag<double> d_var("Density", qp_scalar);
      MDField<double> den(d_var); 
      fm.getFieldData<double,MyTraits::Residual>(den);
      cout << "size of density = " << den.size() << ", should be " 
	   << d_var.dataLayout().size() << "." << endl;
      TEUCHOS_TEST_FOR_EXCEPTION(den.size() != d_var.dataLayout().size(),
			 std::runtime_error, 
			 "Returned arrays are not sized correctly!");

      cout << endl;

     //Jacobian:
     std::vector<JacScalarT>
        j_local_energy_flux_at_qp(num_local_cells * qp_vec->size());
      std::vector<JacScalarT>
        j_local_source_at_qp(num_local_cells * qp_scalar->size());
      // Fields we require
      MDField<JacScalarT,Cell,QuadPoint,Dim> j_energy_flux(j_energy_flux_tag);
      MDField<JacScalarT,Cell,QuadPoint> j_source(j_source_tag);
      fm.getFieldData<JacScalarT,MyTraits::Jacobian,Cell,QuadPoint,Dim>(j_energy_flux);
      fm.getFieldData<JacScalarT,MyTraits::Jacobian,Cell,QuadPoint>(j_source);
      RCP<Time> eval_time2 = TimeMonitor::getNewTimer("Evaluation Time Jacobian");

      auto host_j_energy_flux = Kokkos::create_mirror_view(j_energy_flux.get_kokkos_view());
      auto host_j_source = Kokkos::create_mirror_view(j_source.get_kokkos_view());

      fm.preEvaluate<MyTraits::Jacobian>(NULL);
      {
        TimeMonitor t(*eval_time2);

        for (std::size_t i = 0; i < worksets.size(); ++i) {
#ifdef PHX_ENABLE_KOKKOS_AMT
	  fm.evaluateFieldsTaskParallel<MyTraits::Jacobian>(1,worksets[i]);
#else
          fm.evaluateFields<MyTraits::Jacobian>(worksets[i]);
#endif
	  Kokkos::deep_copy(host_j_energy_flux,j_energy_flux.get_kokkos_view());
	  Kokkos::deep_copy(host_j_source,j_source.get_kokkos_view());

           for (size_type cell = 0; cell < j_energy_flux.dimension(0); ++cell) {
            for (size_type ip = 0; ip < j_energy_flux.dimension(1); ++ip) {
              for (size_type dim = 0; dim < j_energy_flux.dimension(2); ++dim) {
                std::size_t index = cell * j_energy_flux.dimension(0) +
                  ip * j_energy_flux.dimension(1) + dim;
                j_local_energy_flux_at_qp[index] =  host_j_energy_flux(cell,ip,dim);
              }
            }
          }
          for (size_type cell = 0; cell < j_energy_flux.dimension(0); ++cell) {
            for (size_type ip = 0; ip < j_energy_flux.dimension(1); ++ip) {
              std::size_t index = cell * j_energy_flux.dimension(0) + ip;
              j_local_source_at_qp[index] =  host_j_source(cell,ip);
            }
          }


        }
       }
      fm.postEvaluate<MyTraits::Jacobian>(NULL);

      double scalability = 0.0;
      double parallelizability = 1.0;
      fm.analyzeGraph<MyTraits::Residual>(scalability,parallelizability);
      std::cout << "Residual Scalability = " << scalability << std::endl;
      std::cout << "Residual Parallelizability = " 
		<< parallelizability << std::endl;
      fm.analyzeGraph<MyTraits::Jacobian>(scalability,parallelizability);
      std::cout << "Residual Scalability = " << scalability << std::endl;
      std::cout << "Residual Parallelizability = " 
		<< parallelizability << std::endl;
    }
    
    // *********************************************************************
    // Finished all testing
    // *********************************************************************
    std::cout << "\nRun has completed successfully!\n" << std::endl; 
    // *********************************************************************
    // *********************************************************************

    PHX::FinalizeKokkosDevice();

  }
  catch (const std::exception& e) {
    std::cout << "************************************************" << endl;
    std::cout << "************************************************" << endl;
    std::cout << "Exception Caught!" << endl;
    std::cout << "Error message is below\n " << e.what() << endl;
    std::cout << "************************************************" << endl;
  }
  catch (...) {
    std::cout << "************************************************" << endl;
    std::cout << "************************************************" << endl;
    std::cout << "Unknown Exception Caught!" << endl;
    std::cout << "************************************************" << endl;
  }

  TimeMonitor::summarize();
    
  return 0;
}