/**
 * Example driver: builds the stiffness matrix and right-hand-side vector for a
 * Poisson problem on a structured hexahedral mesh of the unit cube.
 *
 * Command line:  NX NY NZ [verbose]
 *   NX, NY, NZ  - number of intervals in x, y, z (must be positive integers)
 *   verbose     - optional; any extra argument routes progress output to std::cout
 *
 * Returns 0 on success and 1 when the arguments are missing or invalid.
 * When DUMP_DATA is defined (it is defined below) the node coordinates,
 * connectivity, matrix and rhs are written to files in the working directory.
 */
int main(int argc, char *argv[]) {
  //Check number of arguments
  // Done before Kokkos::initialize() so that the usage-error path does not
  // leave Kokkos initialized without a matching Kokkos::finalize().
   if (argc < 4) {
      std::cout <<"\n>>> ERROR: Invalid number of arguments.\n\n";
      std::cout <<"Usage:\n\n";
      std::cout <<"  ./Intrepid_example_Drivers_Example_03.exe NX NY NZ verbose\n\n";
      std::cout <<" where \n";
      std::cout <<"   int NX              - num intervals in x direction (assumed box domain, 0,1) \n";
      std::cout <<"   int NY              - num intervals in y direction (assumed box domain, 0,1) \n";
      std::cout <<"   int NZ              - num intervals in z direction (assumed box domain, 0,1) \n";
      std::cout <<"   verbose (optional)  - any character, indicates verbose output \n\n";
      return 1;
   }

  Kokkos::initialize();
  
  // This little trick lets us print to std::cout only if
  // a (dummy) command-line argument is provided.
  // iprint > 3 means a fourth user argument (the "verbose" flag) is present.
  int iprint     = argc - 1;
  Teuchos::RCP<std::ostream> outStream;
  Teuchos::oblackholestream bhs; // outputs nothing
  if (iprint > 3)
    outStream = Teuchos::rcp(&std::cout, false);
  else
    outStream = Teuchos::rcp(&bhs, false);
  
  // Save the format state of the original std::cout.
  Teuchos::oblackholestream oldFormatState;
  oldFormatState.copyfmt(std::cout);
  
  *outStream \
    << "===============================================================================\n" \
    << "|                                                                             |\n" \
    << "|  Example: Generate Stiffness Matrix and Right Hand Side Vector for          |\n" \
    << "|                   Poisson Equation on Hexahedral Mesh                       |\n" \
    << "|                                                                             |\n" \
    << "|  Questions? Contact  Pavel Bochev  ([email protected]),                    |\n" \
    << "|                      Denis Ridzal  ([email protected]),                    |\n" \
    << "|                      Kara Peterson ([email protected]).                    |\n" \
    << "|                                                                             |\n" \
    << "|  Intrepid's website: http://trilinos.sandia.gov/packages/intrepid           |\n" \
    << "|  Trilinos website:   http://trilinos.sandia.gov                             |\n" \
    << "|                                                                             |\n" \
    << "===============================================================================\n";


// ************************************ GET INPUTS **************************************

    int NX            = atoi(argv[1]);  // num intervals in x direction (assumed box domain, 0,1)
    int NY            = atoi(argv[2]);  // num intervals in y direction (assumed box domain, 0,1)
    int NZ            = atoi(argv[3]);  // num intervals in z direction (assumed box domain, 0,1)

   // atoi yields 0 for non-numeric text; zero or negative interval counts would
   // divide by zero in the mesh spacing below and produce invalid container sizes.
    if (NX < 1 || NY < 1 || NZ < 1) {
      std::cout <<"\n>>> ERROR: NX, NY and NZ must be positive integers.\n\n";
      Kokkos::finalize();
      return 1;
    }

// *********************************** CELL TOPOLOGY **********************************

   // Get cell topology for base hexahedron
    typedef shards::CellTopology    CellTopology;
    CellTopology hex_8(shards::getCellTopologyData<shards::Hexahedron<8> >() );

   // Get dimensions 
    int numNodesPerElem = hex_8.getNodeCount();
    int spaceDim = hex_8.getDimension();

// *********************************** GENERATE MESH ************************************

    *outStream << "Generating mesh ... \n\n";

    *outStream << "   NX" << "   NY" << "   NZ\n";
    *outStream << std::setw(5) << NX <<
                 std::setw(5) << NY <<
                 std::setw(5) << NZ << "\n\n";

   // Print mesh information
   // NOTE(review): these products are plain int arithmetic and can overflow for
   // very large NX, NY, NZ.
    int numElems = NX*NY*NZ;
    int numNodes = (NX+1)*(NY+1)*(NZ+1);
    *outStream << " Number of Elements: " << numElems << " \n";
    *outStream << "    Number of Nodes: " << numNodes << " \n\n";

   // Cube
    double leftX = 0.0, rightX = 1.0;
    double leftY = 0.0, rightY = 1.0;
    double leftZ = 0.0, rightZ = 1.0;

   // Mesh spacing
    double hx = (rightX-leftX)/((double)NX);
    double hy = (rightY-leftY)/((double)NY);
    double hz = (rightZ-leftZ)/((double)NZ);

   // Get nodal coordinates
   // Nodes are numbered with i (x) varying fastest, then j (y), then k (z).
   // nodeOnBoundary(n) is 1 when node n lies on any face of the box, else 0.
    FieldContainer<double> nodeCoord(numNodes, spaceDim);
    FieldContainer<int> nodeOnBoundary(numNodes);
    int inode = 0;
    for (int k=0; k<NZ+1; k++) {
      for (int j=0; j<NY+1; j++) {
        for (int i=0; i<NX+1; i++) {
          nodeCoord(inode,0) = leftX + (double)i*hx;
          nodeCoord(inode,1) = leftY + (double)j*hy;
          nodeCoord(inode,2) = leftZ + (double)k*hz;
          if (k==0 || j==0 || i==0 || k==NZ || j==NY || i==NX){
             nodeOnBoundary(inode)=1;
          }
          else {
             nodeOnBoundary(inode)=0;
          }
          inode++;
        }
      }
    }
#define DUMP_DATA
#ifdef DUMP_DATA
   // Print nodal coords
   // One line per node: "x y z", written to coords.dat in the working directory.
    ofstream fcoordout("coords.dat");
    for (int i=0; i<numNodes; i++) {
       fcoordout << nodeCoord(i,0) <<" ";
       fcoordout << nodeCoord(i,1) <<" ";
       fcoordout << nodeCoord(i,2) <<"\n";
    }
    fcoordout.close();
#endif


  // Element to Node map
  // Elements are numbered like the nodes (i fastest). Local nodes 0-3 are the
  // element's k-level face and 4-7 the (k+1)-level face, each listed in the
  // order (i,j), (i+1,j), (i+1,j+1), (i,j+1).
    FieldContainer<int> elemToNode(numElems, numNodesPerElem);
    int ielem = 0;
    for (int k=0; k<NZ; k++) {
      for (int j=0; j<NY; j++) {
        for (int i=0; i<NX; i++) {
          elemToNode(ielem,0) = (NY + 1)*(NX + 1)*k + (NX + 1)*j + i;
          elemToNode(ielem,1) = (NY + 1)*(NX + 1)*k + (NX + 1)*j + i + 1;
          elemToNode(ielem,2) = (NY + 1)*(NX + 1)*k + (NX + 1)*(j + 1) + i + 1;
          elemToNode(ielem,3) = (NY + 1)*(NX + 1)*k + (NX + 1)*(j + 1) + i;
          elemToNode(ielem,4) = (NY + 1)*(NX + 1)*(k + 1) + (NX + 1)*j + i;
          elemToNode(ielem,5) = (NY + 1)*(NX + 1)*(k + 1) + (NX + 1)*j + i + 1;
          elemToNode(ielem,6) = (NY + 1)*(NX + 1)*(k + 1) + (NX + 1)*(j + 1) + i + 1;
          elemToNode(ielem,7) = (NY + 1)*(NX + 1)*(k + 1) + (NX + 1)*(j + 1) + i;
          ielem++;
        }
      }
    }
#ifdef DUMP_DATA
   // Output connectivity
   // One line per element listing its node ids, written to elem2node.dat.
    ofstream fe2nout("elem2node.dat");
    for (int k=0; k<NZ; k++) {
      for (int j=0; j<NY; j++) {
        for (int i=0; i<NX; i++) {
          // Local ielem intentionally shadows the counter above; it recomputes
          // the same i-fastest element index.
          int ielem = i + j * NX + k * NX * NY;
          for (int m=0; m<numNodesPerElem; m++){
              fe2nout << elemToNode(ielem,m) <<"  ";
           }
          fe2nout <<"\n";
        }
      }
    }
    fe2nout.close();
#endif


// ************************************ CUBATURE **************************************

    *outStream << "Getting cubature ... \n\n";

   // Get numerical integration points and weights
    DefaultCubatureFactory<double>  cubFactory;                                   
    int cubDegree = 2;
    Teuchos::RCP<Cubature<double> > hexCub = cubFactory.create(hex_8, cubDegree); 

    int cubDim       = hexCub->getDimension();
    int numCubPoints = hexCub->getNumPoints();

    FieldContainer<double> cubPoints(numCubPoints, cubDim);
    FieldContainer<double> cubWeights(numCubPoints);

    hexCub->getCubature(cubPoints, cubWeights);


// ************************************** BASIS ***************************************

     *outStream << "Getting basis ... \n\n";

   // Define basis 
     Basis_HGRAD_HEX_C1_FEM<double, FieldContainer<double> > hexHGradBasis;
     int numFieldsG = hexHGradBasis.getCardinality();
     FieldContainer<double> hexGVals(numFieldsG, numCubPoints); 
     FieldContainer<double> hexGrads(numFieldsG, numCubPoints, spaceDim); 

  // Evaluate basis values and gradients at cubature points
     hexHGradBasis.getValues(hexGVals, cubPoints, OPERATOR_VALUE);
     hexHGradBasis.getValues(hexGrads, cubPoints, OPERATOR_GRAD);


// ******** LOOP OVER ELEMENTS TO CREATE LOCAL STIFFNESS MATRIX *************

    *outStream << "Building stiffness matrix and right hand side ... \n\n";

 // Settings and data structures for mass and stiffness matrices
    typedef CellTools<double>  CellTools;
    typedef FunctionSpaceTools fst;
    // Elements are processed one at a time, so every per-cell container below
    // has a leading dimension of 1 and is reused across loop iterations.
    int numCells = 1; 

   // Container for nodes
    FieldContainer<double> hexNodes(numCells, numNodesPerElem, spaceDim);
   // Containers for Jacobian
    FieldContainer<double> hexJacobian(numCells, numCubPoints, spaceDim, spaceDim);
    FieldContainer<double> hexJacobInv(numCells, numCubPoints, spaceDim, spaceDim);
    FieldContainer<double> hexJacobDet(numCells, numCubPoints);
   // Containers for element HGRAD stiffness matrix
    FieldContainer<double> localStiffMatrix(numCells, numFieldsG, numFieldsG);
    FieldContainer<double> weightedMeasure(numCells, numCubPoints);
    FieldContainer<double> hexGradsTransformed(numCells, numFieldsG, numCubPoints, spaceDim);
    FieldContainer<double> hexGradsTransformedWeighted(numCells, numFieldsG, numCubPoints, spaceDim);
   // Containers for right hand side vectors
    FieldContainer<double> rhsData(numCells, numCubPoints);
    FieldContainer<double> localRHS(numCells, numFieldsG);
    FieldContainer<double> hexGValsTransformed(numCells, numFieldsG, numCubPoints);
    FieldContainer<double> hexGValsTransformedWeighted(numCells, numFieldsG, numCubPoints);
   // Container for cubature points in physical space
    FieldContainer<double> physCubPoints(numCells, numCubPoints, cubDim);

   // Global arrays in Epetra format 
   // One matrix row / vector entry per mesh node, base index 0, serial communicator.
    Epetra_SerialComm Comm;
    Epetra_Map globalMapG(numNodes, 0, Comm);
    Epetra_FECrsMatrix StiffMatrix(Copy, globalMapG, numFieldsG);
    Epetra_FEVector rhs(globalMapG);

 // *** Element loop ***
    for (int k=0; k<numElems; k++) {

     // Physical cell coordinates
      for (int i=0; i<numNodesPerElem; i++) {
         hexNodes(0,i,0) = nodeCoord(elemToNode(k,i),0);
         hexNodes(0,i,1) = nodeCoord(elemToNode(k,i),1);
         hexNodes(0,i,2) = nodeCoord(elemToNode(k,i),2);
      }

    // Compute cell Jacobians, their inverses and their determinants
       CellTools::setJacobian(hexJacobian, cubPoints, hexNodes, hex_8);
       CellTools::setJacobianInv(hexJacobInv, hexJacobian );
       CellTools::setJacobianDet(hexJacobDet, hexJacobian );

// ************************** Compute element HGrad stiffness matrices *******************************
  
     // transform to physical coordinates 
      fst::HGRADtransformGRAD<double>(hexGradsTransformed, hexJacobInv, hexGrads);
      
     // compute weighted measure
      fst::computeCellMeasure<double>(weightedMeasure, hexJacobDet, cubWeights);

     // multiply values with weighted measure
      fst::multiplyMeasure<double>(hexGradsTransformedWeighted,
                                   weightedMeasure, hexGradsTransformed);

     // integrate to compute element stiffness matrix
      fst::integrate<double>(localStiffMatrix,
                             hexGradsTransformed, hexGradsTransformedWeighted, COMP_BLAS);

      // assemble into global matrix
      // Entries are inserted one at a time; nodes shared between elements are
      // inserted repeatedly at the same (row, col) position.
      for (int row = 0; row < numFieldsG; row++){
        for (int col = 0; col < numFieldsG; col++){
            int rowIndex = elemToNode(k,row);
            int colIndex = elemToNode(k,col);
            double val = localStiffMatrix(0,row,col);
            StiffMatrix.InsertGlobalValues(1, &rowIndex, 1, &colIndex, &val);
         }
      }

// ******************************* Build right hand side ************************************

      // transform integration points to physical points
       CellTools::mapToPhysicalFrame(physCubPoints, cubPoints, hexNodes, hex_8);

      // evaluate right hand side function at physical points
      // evalDivGradu is defined outside this block.
       for (int nPt = 0; nPt < numCubPoints; nPt++){

          double x = physCubPoints(0,nPt,0);
          double y = physCubPoints(0,nPt,1);
          double z = physCubPoints(0,nPt,2);

          rhsData(0,nPt) = evalDivGradu(x, y, z);
       }

     // transform basis values to physical coordinates 
      fst::HGRADtransformVALUE<double>(hexGValsTransformed, hexGVals);

     // multiply values with weighted measure
      fst::multiplyMeasure<double>(hexGValsTransformedWeighted,
                                   weightedMeasure, hexGValsTransformed);

     // integrate rhs term
      fst::integrate<double>(localRHS, rhsData, hexGValsTransformedWeighted, 
                             COMP_BLAS);

    // assemble into global vector
    // The integrated value is negated before being summed into the global rhs.
     for (int row = 0; row < numFieldsG; row++){
           int rowIndex = elemToNode(k,row);
           double val = -localRHS(0,row);
           rhs.SumIntoGlobalValues(1, &rowIndex, &val);
      }
     
 } // *** end element loop ***


  // Assemble global matrices
   StiffMatrix.GlobalAssemble(); StiffMatrix.FillComplete();
   rhs.GlobalAssemble();

 
  // Adjust stiffness matrix and rhs based on boundary conditions
  // Each boundary row is overwritten with zeros and a unit diagonal, and the
  // matching rhs entry is set to 0. Columns of boundary nodes in interior rows
  // are left untouched.
  // NOTE(review): the inner loop visits every column index 0..numNodes-1 for each
  // boundary row, which is quadratic in numNodes. Presumably most of those columns
  // are not in the row's sparsity pattern; iterating only the stored entries of the
  // row would be cheaper - confirm against the Epetra_CrsMatrix API before changing.
   for (int row = 0; row<numNodes; row++){
       if (nodeOnBoundary(row)) {
          int rowindex = row;
          for (int col=0; col<numNodes; col++){
              double val = 0.0;
              int colindex = col;
              StiffMatrix.ReplaceGlobalValues(1, &rowindex, 1, &colindex, &val);
          }
          double val = 1.0;
          StiffMatrix.ReplaceGlobalValues(1, &rowindex, 1, &rowindex, &val);
          val = 0.0;
          rhs.ReplaceGlobalValues(1, &rowindex, &val);
       }
    }

#ifdef DUMP_DATA
   // Dump matrices to disk
     EpetraExt::RowMatrixToMatlabFile("stiff_matrix.dat",StiffMatrix);
     EpetraExt::MultiVectorToMatrixMarketFile("rhs_vector.dat",rhs,0,0,false);
#endif

   // Printed unconditionally once assembly has completed.
   std::cout << "End Result: TEST PASSED\n";
   
   // reset format state of std::cout
   std::cout.copyfmt(oldFormatState);
   Kokkos::finalize();
   return 0;
}
int main(int argc, char *argv[]) {

    Teuchos::GlobalMPISession mpiSession(&argc, &argv);
    Kokkos::initialize();
    // This little trick lets us print to std::cout only if
    // a (dummy) command-line argument is provided.
    int iprint     = argc - 1;
    Teuchos::RCP<std::ostream> outStream;
    Teuchos::oblackholestream bhs; // outputs nothing
    if (iprint > 0)
        outStream = Teuchos::rcp(&std::cout, false);
    else
        outStream = Teuchos::rcp(&bhs, false);

    // Save the format state of the original std::cout.
    Teuchos::oblackholestream oldFormatState;
    oldFormatState.copyfmt(std::cout);

    *outStream \
            << "===============================================================================\n" \
            << "|                                                                             |\n" \
            << "|                 Unit Test (Basis_HGRAD_HEX_C1_FEM)                          |\n" \
            << "|                                                                             |\n" \
            << "|     1) Conversion of Dof tags into Dof ordinals and back                    |\n" \
            << "|     2) Basis values for VALUE, GRAD, CURL, and Dk operators                 |\n" \
            << "|                                                                             |\n" \
            << "|  Questions? Contact  Pavel Bochev  ([email protected]),                    |\n" \
            << "|                      Denis Ridzal  ([email protected]),                    |\n" \
            << "|                      Kara Peterson ([email protected]).                    |\n" \
            << "|                                                                             |\n" \
            << "|  Intrepid's website: http://trilinos.sandia.gov/packages/intrepid           |\n" \
            << "|  Trilinos website:   http://trilinos.sandia.gov                             |\n" \
            << "|                                                                             |\n" \
            << "===============================================================================\n"\
            << "| TEST 1: Basis creation, exception testing                                   |\n"\
            << "===============================================================================\n";

    // Define basis and error flag
    Basis_HGRAD_HEX_C1_FEM<double, FieldContainer<double> > hexBasis;
    int errorFlag = 0;

    // Initialize throw counter for exception testing
    int nException     = 0;
    int throwCounter   = 0;

    // Define array containing the 8 vertices of the reference HEX, its center and 6 face centers
    FieldContainer<double> hexNodes(15, 3);
    hexNodes(0,0) = -1.0;
    hexNodes(0,1) = -1.0;
    hexNodes(0,2) = -1.0;
    hexNodes(1,0) =  1.0;
    hexNodes(1,1) = -1.0;
    hexNodes(1,2) = -1.0;
    hexNodes(2,0) =  1.0;
    hexNodes(2,1) =  1.0;
    hexNodes(2,2) = -1.0;
    hexNodes(3,0) = -1.0;
    hexNodes(3,1) =  1.0;
    hexNodes(3,2) = -1.0;

    hexNodes(4,0) = -1.0;
    hexNodes(4,1) = -1.0;
    hexNodes(4,2) =  1.0;
    hexNodes(5,0) =  1.0;
    hexNodes(5,1) = -1.0;
    hexNodes(5,2) =  1.0;
    hexNodes(6,0) =  1.0;
    hexNodes(6,1) =  1.0;
    hexNodes(6,2) =  1.0;
    hexNodes(7,0) = -1.0;
    hexNodes(7,1) =  1.0;
    hexNodes(7,2) =  1.0;

    hexNodes(8,0) =  0.0;
    hexNodes(8,1) =  0.0;
    hexNodes(8,2) =  0.0;

    hexNodes(9,0) =  1.0;
    hexNodes(9,1) =  0.0;
    hexNodes(9,2) =  0.0;
    hexNodes(10,0)= -1.0;
    hexNodes(10,1)=  0.0;
    hexNodes(10,2)=  0.0;

    hexNodes(11,0)=  0.0;
    hexNodes(11,1)=  1.0;
    hexNodes(11,2)=  0.0;
    hexNodes(12,0)=  0.0;
    hexNodes(12,1)= -1.0;
    hexNodes(12,2)=  0.0;

    hexNodes(13,0)=  0.0;
    hexNodes(13,1)=  0.0;
    hexNodes(13,2)=  1.0;
    hexNodes(14,0)=  0.0;
    hexNodes(14,1)=  0.0;
    hexNodes(14,2)= -1.0;


    // Generic array for the output values; needs to be properly resized depending on the operator type
    FieldContainer<double> vals;

    try {
        // exception #1: CURL cannot be applied to scalar functions in 3D
        // resize vals to rank-3 container with dimensions (num. basis functions, num. points, arbitrary)
        vals.resize(hexBasis.getCardinality(), hexNodes.dimension(0), 4 );
        INTREPID_TEST_COMMAND( hexBasis.getValues(vals, hexNodes, OPERATOR_CURL), throwCounter, nException );

        // exception #2: DIV cannot be applied to scalar functions in 3D
        // resize vals to rank-2 container with dimensions (num. basis functions, num. points)
        vals.resize(hexBasis.getCardinality(), hexNodes.dimension(0) );
        INTREPID_TEST_COMMAND( hexBasis.getValues(vals, hexNodes, OPERATOR_DIV), throwCounter, nException );

        // Exceptions 3-7: all bf tags/bf Ids below are wrong and should cause getDofOrdinal() and
        // getDofTag() to access invalid array elements thereby causing bounds check exception
        // exception #3
        INTREPID_TEST_COMMAND( hexBasis.getDofOrdinal(3,0,0), throwCounter, nException );
        // exception #4
        INTREPID_TEST_COMMAND( hexBasis.getDofOrdinal(1,1,1), throwCounter, nException );
        // exception #5
        INTREPID_TEST_COMMAND( hexBasis.getDofOrdinal(0,4,1), throwCounter, nException );
        // exception #6
        INTREPID_TEST_COMMAND( hexBasis.getDofTag(8), throwCounter, nException );
        // exception #7
        INTREPID_TEST_COMMAND( hexBasis.getDofTag(-1), throwCounter, nException );

#ifdef HAVE_INTREPID2_DEBUG
        // Exceptions 8-18 test exception handling with incorrectly dimensioned input/output arrays
        // exception #8: input points array must be of rank-2
        FieldContainer<double> badPoints1(4, 5, 3);
        INTREPID_TEST_COMMAND( hexBasis.getValues(vals, badPoints1, OPERATOR_VALUE), throwCounter, nException );

        // exception #9 dimension 1 in the input point array must equal space dimension of the cell
        FieldContainer<double> badPoints2(4, 2);
        INTREPID_TEST_COMMAND( hexBasis.getValues(vals, badPoints2, OPERATOR_VALUE), throwCounter, nException );

        // exception #10 output values must be of rank-2 for OPERATOR_VALUE
        FieldContainer<double> badVals1(4, 3, 1);
        INTREPID_TEST_COMMAND( hexBasis.getValues(badVals1, hexNodes, OPERATOR_VALUE), throwCounter, nException );

        // exception #11 output values must be of rank-3 for OPERATOR_GRAD
        FieldContainer<double> badVals2(4, 3);
        INTREPID_TEST_COMMAND( hexBasis.getValues(badVals2, hexNodes, OPERATOR_GRAD), throwCounter, nException );

        // exception #12 output values must be of rank-3 for OPERATOR_D1
        INTREPID_TEST_COMMAND( hexBasis.getValues(badVals2, hexNodes, OPERATOR_D1), throwCounter, nException );

        // exception #13 output values must be of rank-3 for OPERATOR_D2
        INTREPID_TEST_COMMAND( hexBasis.getValues(badVals2, hexNodes, OPERATOR_D2), throwCounter, nException );

        // exception #14 incorrect 0th dimension of output array (must equal number of basis functions)
        FieldContainer<double> badVals3(hexBasis.getCardinality() + 1, hexNodes.dimension(0));
        INTREPID_TEST_COMMAND( hexBasis.getValues(badVals3, hexNodes, OPERATOR_VALUE), throwCounter, nException );

        // exception #15 incorrect 1st dimension of output array (must equal number of points)
        FieldContainer<double> badVals4(hexBasis.getCardinality(), hexNodes.dimension(0) + 1);
        INTREPID_TEST_COMMAND( hexBasis.getValues(badVals4, hexNodes, OPERATOR_VALUE), throwCounter, nException );

        // exception #16: incorrect 2nd dimension of output array (must equal the space dimension)
        FieldContainer<double> badVals5(hexBasis.getCardinality(), hexNodes.dimension(0), 4);
        INTREPID_TEST_COMMAND( hexBasis.getValues(badVals5, hexNodes, OPERATOR_GRAD), throwCounter, nException );

        // exception #17: incorrect 2nd dimension of output array (must equal D2 cardinality in 3D)
        FieldContainer<double> badVals6(hexBasis.getCardinality(), hexNodes.dimension(0), 40);
        INTREPID_TEST_COMMAND( hexBasis.getValues(badVals6, hexNodes, OPERATOR_D2), throwCounter, nException );

        // exception #18: incorrect 2nd dimension of output array (must equal D3 cardinality in 3D)
        FieldContainer<double> badVals7(hexBasis.getCardinality(), hexNodes.dimension(0), 50);
        INTREPID_TEST_COMMAND( hexBasis.getValues(badVals7, hexNodes, OPERATOR_D3), throwCounter, nException );
#endif

    }
    catch (std::logic_error err) {
        *outStream << "UNEXPECTED ERROR !!! ----------------------------------------------------------\n";
        *outStream << err.what() << '\n';
        *outStream << "-------------------------------------------------------------------------------" << "\n\n";
        errorFlag = -1000;
    };

    // Check if number of thrown exceptions matches the one we expect
    // Note Teuchos throw number will not pick up exceptions 3-7 and therefore will not match.
    if (throwCounter != nException) {
        errorFlag++;
        *outStream << std::setw(70) << "^^^^----FAILURE!" << "\n";
    }

    *outStream \
            << "\n"
            << "===============================================================================\n"\
            << "| TEST 2: correctness of tag to enum and enum to tag lookups                  |\n"\
            << "===============================================================================\n";

    try {
        std::vector<std::vector<int> > allTags = hexBasis.getAllDofTags();

        // Loop over all tags, lookup the associated dof enumeration and then lookup the tag again
        for (unsigned i = 0; i < allTags.size(); i++) {
            int bfOrd  = hexBasis.getDofOrdinal(allTags[i][0], allTags[i][1], allTags[i][2]);

            std::vector<int> myTag = hexBasis.getDofTag(bfOrd);
            if( !( (myTag[0] == allTags[i][0]) &&
                    (myTag[1] == allTags[i][1]) &&
                    (myTag[2] == allTags[i][2]) &&
                    (myTag[3] == allTags[i][3]) ) ) {
                errorFlag++;
                *outStream << std::setw(70) << "^^^^----FAILURE!" << "\n";
                *outStream << " getDofOrdinal( {"
                           << allTags[i][0] << ", "
                           << allTags[i][1] << ", "
                           << allTags[i][2] << ", "
                           << allTags[i][3] << "}) = " << bfOrd <<" but \n";
                *outStream << " getDofTag(" << bfOrd << ") = { "
                           << myTag[0] << ", "
                           << myTag[1] << ", "
                           << myTag[2] << ", "
                           << myTag[3] << "}\n";
            }
        }

        // Now do the same but loop over basis functions
        for( int bfOrd = 0; bfOrd < hexBasis.getCardinality(); bfOrd++) {
            std::vector<int> myTag  = hexBasis.getDofTag(bfOrd);
            int myBfOrd = hexBasis.getDofOrdinal(myTag[0], myTag[1], myTag[2]);
            if( bfOrd != myBfOrd) {
                errorFlag++;
                *outStream << std::setw(70) << "^^^^----FAILURE!" << "\n";
                *outStream << " getDofTag(" << bfOrd << ") = { "
                           << myTag[0] << ", "
                           << myTag[1] << ", "
                           << myTag[2] << ", "
                           << myTag[3] << "} but getDofOrdinal({"
                           << myTag[0] << ", "
                           << myTag[1] << ", "
                           << myTag[2] << ", "
                           << myTag[3] << "} ) = " << myBfOrd << "\n";
            }
        }
    }
    catch (std::logic_error err) {
        *outStream << err.what() << "\n\n";
        errorFlag = -1000;
    };

    *outStream \
            << "\n"
            << "===============================================================================\n"\
            << "| TEST 3: correctness of basis function values                                |\n"\
            << "===============================================================================\n";

    outStream -> precision(20);

    // VALUE: Each row gives the 8 correct basis set values at an evaluation point
    double basisValues[] = {
        // bottom 4 vertices
        1.0, 0.0, 0.0, 0.0,  0.0, 0.0, 0.0, 0.0,
        0.0, 1.0, 0.0, 0.0,  0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 1.0, 0.0,  0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 1.0,  0.0, 0.0, 0.0, 0.0,
        // top 4 vertices
        0.0, 0.0, 0.0, 0.0,  1.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0,  0.0, 1.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0,  0.0, 0.0, 1.0, 0.0,
        0.0, 0.0, 0.0, 0.0,  0.0, 0.0, 0.0, 1.0,
        // center {0, 0, 0}
        0.125, 0.125, 0.125, 0.125,  0.125, 0.125, 0.125, 0.125,
        // faces { 1, 0, 0} and {-1, 0, 0}
        0.0,   0.25,  0.25,  0.0,    0.0,   0.25,  0.25,  0.0,
        0.25,  0.0,   0.0,   0.25,   0.25,  0.0,   0.0,   0.25,
        // faces { 0, 1, 0} and { 0,-1, 0}
        0.0,   0.0,   0.25,  0.25,   0.0,   0.0,   0.25,  0.25,
        0.25,  0.25,  0.0,   0.0,    0.25,  0.25,  0.0,   0.0,
        // faces {0, 0, 1} and {0, 0, -1}
        0.0,   0.0,   0.0,   0.0,    0.25,  0.25,  0.25,  0.25,
        0.25,  0.25,  0.25,  0.25,   0.0,   0.0,   0.0,   0.0,
    };

    // GRAD and D1: each row gives the 3x8 correct values of the gradients of the 8 basis functions
    double basisGrads[] = {
        // points 0-3
        -0.5,-0.5,-0.5,  0.5, 0.0, 0.0,  0.0, 0.0, 0.0,  0.0, 0.5, 0.0,  0.0, 0.0, 0.5,  0.0, 0.0, 0.0,  0.0, 0.0, 0.0,  0.0, 0.0, 0.0,
        -0.5, 0.0, 0.0,  0.5,-0.5,-0.5,  0.0, 0.5, 0.0,  0.0, 0.0, 0.0,  0.0, 0.0, 0.0,  0.0, 0.0, 0.5,  0.0, 0.0, 0.0,  0.0, 0.0, 0.0,
        0.0, 0.0, 0.0,  0.0,-0.5, 0.0,  0.5, 0.5,-0.5, -0.5, 0.0, 0.0,  0.0, 0.0, 0.0,  0.0, 0.0, 0.0,  0.0, 0.0, 0.5,  0.0, 0.0, 0.0,
        0.0,-0.5, 0.0,  0.0, 0.0, 0.0,  0.5, 0.0, 0.0, -0.5, 0.5,-0.5,  0.0, 0.0, 0.0,  0.0, 0.0, 0.0,  0.0, 0.0, 0.0,  0.0, 0.0, 0.5,
        // points 4-7
        0.0, 0.0,-0.5,  0.0, 0.0, 0.0,  0.0, 0.0, 0.0,  0.0, 0.0, 0.0, -0.5,-0.5, 0.5,  0.5, 0.0, 0.0,  0.0, 0.0, 0.0,  0.0, 0.5, 0.0,
        0.0, 0.0, 0.0,  0.0, 0.0,-0.5,  0.0, 0.0, 0.0,  0.0, 0.0, 0.0, -0.5, 0.0, 0.0,  0.5,-0.5, 0.5,  0.0, 0.5, 0.0,  0.0, 0.0, 0.0,
        0.0, 0.0, 0.0,  0.0, 0.0, 0.0,  0.0, 0.0,-0.5,  0.0, 0.0, 0.0,  0.0, 0.0, 0.0,  0.0,-0.5, 0.0,  0.5, 0.5, 0.5, -0.5, 0.0, 0.0,
        0.0, 0.0, 0.0,  0.0, 0.0, 0.0,  0.0, 0.0, 0.0,  0.0, 0.0,-0.5,  0.0,-0.5, 0.0,  0.0, 0.0, 0.0,  0.5, 0.0, 0.0, -0.5, 0.5, 0.5,
        // point 8
        -0.125,-0.125,-0.125,  0.125,-0.125,-0.125,  0.125, 0.125,-0.125, \
        -0.125, 0.125,-0.125, -0.125,-0.125, 0.125,  0.125,-0.125, 0.125, \
        0.125, 0.125, 0.125, -0.125, 0.125, 0.125,
        // point 9
        -0.125, 0.0,   0.0,    0.125,-0.25, -0.25,   0.125, 0.25, -0.25,  -0.125, 0.0, 0.0, \
        -0.125, 0.0,   0.0,    0.125,-0.25,  0.25,   0.125, 0.25,  0.25,  -0.125, 0.0, 0.0,
        // point 10
        -0.125,-0.25, -0.25,   0.125, 0.0,   0.0,    0.125, 0.0,   0.0,   -0.125, 0.25, -0.25,\
        -0.125,-0.25,  0.25,   0.125, 0.0,   0.0,    0.125, 0.0,   0.0,   -0.125, 0.25,  0.25,
        // point 11
        0.0,  -0.125, 0.0,    0.0,  -0.125, 0.0,    0.25,  0.125,-0.25,  -0.25,  0.125,-0.25,\
        0.0,  -0.125, 0.0,    0.0,  -0.125, 0.0,    0.25,  0.125, 0.25,  -0.25,  0.125, 0.25,
        // point 12
        -0.25, -0.125,-0.25,   0.25, -0.125,-0.25,   0.0,   0.125, 0.0,    0.0,   0.125, 0.0, \
        -0.25, -0.125, 0.25,   0.25, -0.125, 0.25,   0.0,   0.125, 0.0,    0.0,   0.125, 0.0,
        // point 13
        0.0,   0.0,  -0.125,  0.0,   0.0,  -0.125,  0.0,   0.0,  -0.125,  0.0,   0.0,  -0.125, \
        -0.25, -0.25,  0.125,  0.25, -0.25,  0.125,  0.25,  0.25,  0.125, -0.25,  0.25,  0.125,
        // point 14
        -0.25, -0.25, -0.125,  0.25, -0.25, -0.125,  0.25,  0.25, -0.125, -0.25,  0.25, -0.125, \
        0.0,   0.0,   0.125,  0.0,   0.0,   0.125,  0.0,   0.0,   0.125,  0.0,   0.0,   0.125
    };

    //D2: flat array with the values of D2 applied to basis functions. Multi-index is (P,F,K)
    double basisD2[] = {
        // point 0
        0, 0.25, 0.25, 0, 0.25, 0, 0, -0.25, -0.25, 0, 0., 0, 0, 0.25, 0., 0, \
        0., 0, 0, -0.25, 0., 0, -0.25, 0, 0, 0., -0.25, 0, -0.25, 0, 0, 0., \
        0.25, 0, 0., 0, 0, 0., 0., 0, 0., 0, 0, 0., 0., 0, 0.25, 0., \
        // point 1
        0, 0.25, 0.25, 0, 0., 0, 0, -0.25, -0.25, 0, 0.25, 0, 0, 0.25, 0., 0, \
        -0.25, 0, 0, -0.25, 0., 0, 0., 0, 0, 0., -0.25, 0, 0., 0, 0, 0., \
        0.25, 0, -0.25, 0, 0, 0., 0., 0, 0.25, 0, 0, 0., 0., 0, 0., 0., \
        // Point 2
        0, 0.25, 0., 0, 0., 0, 0, -0.25, 0., 0, 0.25, 0, 0, 0.25, -0.25, 0, \
        -0.25, 0, 0, -0.25, 0.25, 0, 0., 0, 0, 0., 0., 0, 0., 0, 0, 0., 0., \
        0, -0.25, 0, 0, 0., 0.25, 0, 0.25, 0, 0, 0., -0.25, 0, 0., 0., \
        // Point 3
        0, 0.25, 0., 0, 0.25, 0, 0, -0.25, 0., 0, 0., 0, 0, 0.25, -0.25, 0, \
        0., 0, 0, -0.25, 0.25, 0, -0.25, 0, 0, 0., 0., 0, -0.25, 0, 0, 0., \
        0., 0, 0., 0, 0, 0., 0.25, 0, 0., 0, 0, 0., -0.25, 0, 0.25, 0.,\
        // Point 4
        0, 0., 0.25, 0, 0.25, 0, 0, 0., -0.25, 0, 0., 0, 0, 0., 0., 0, 0., 0, \
        0, 0., 0., 0, -0.25, 0, 0, 0.25, -0.25, 0, -0.25, 0, 0, -0.25, 0.25, \
        0, 0., 0, 0, 0.25, 0., 0, 0., 0, 0, -0.25, 0., 0, 0.25, 0., \
        // Point 5
        0, 0., 0.25, 0, 0., 0, 0, 0., -0.25, 0, 0.25, 0, 0, 0., 0., 0, -0.25, \
        0, 0, 0., 0., 0, 0., 0, 0, 0.25, -0.25, 0, 0., 0, 0, -0.25, 0.25, 0, \
        -0.25, 0, 0, 0.25, 0., 0, 0.25, 0, 0, -0.25, 0., 0, 0., 0., \
        // Point 6
        0, 0., 0., 0, 0., 0, 0, 0., 0., 0, 0.25, 0, 0, 0., -0.25, 0, -0.25, \
        0, 0, 0., 0.25, 0, 0., 0, 0, 0.25, 0., 0, 0., 0, 0, -0.25, 0., 0, \
        -0.25, 0, 0, 0.25, 0.25, 0, 0.25, 0, 0, -0.25, -0.25, 0, 0., 0., \
        // Point 7
        0, 0., 0., 0, 0.25, 0, 0, 0., 0., 0, 0., 0, 0, 0., -0.25, 0, 0., 0, \
        0, 0., 0.25, 0, -0.25, 0, 0, 0.25, 0., 0, -0.25, 0, 0, -0.25, 0., 0, \
        0., 0, 0, 0.25, 0.25, 0, 0., 0, 0, -0.25, -0.25, 0, 0.25, 0., \
        // Point 8
        0, 0.125, 0.125, 0, 0.125, 0, 0, -0.125, -0.125, 0, 0.125, 0, 0, \
        0.125, -0.125, 0, -0.125, 0, 0, -0.125, 0.125, 0, -0.125, 0, 0, \
        0.125, -0.125, 0, -0.125, 0, 0, -0.125, 0.125, 0, -0.125, 0, 0, \
        0.125, 0.125, 0, 0.125, 0, 0, -0.125, -0.125, 0, 0.125, 0., \
        // Point 9
        0, 0.125, 0.125, 0, 0., 0, 0, -0.125, -0.125, 0, 0.25, 0, 0, 0.125, \
        -0.125, 0, -0.25, 0, 0, -0.125, 0.125, 0, 0., 0, 0, 0.125, -0.125, 0, \
        0., 0, 0, -0.125, 0.125, 0, -0.25, 0, 0, 0.125, 0.125, 0, 0.25, 0, 0, \
        -0.125, -0.125, 0, 0., 0., \
        // Point 10
        0, 0.125, 0.125, 0, 0.25, 0, 0, -0.125, -0.125, 0, 0., 0, 0, 0.125, \
        -0.125, 0, 0., 0, 0, -0.125, 0.125, 0, -0.25, 0, 0, 0.125, -0.125, 0, \
        -0.25, 0, 0, -0.125, 0.125, 0, 0., 0, 0, 0.125, 0.125, 0, 0., 0, 0, \
        -0.125, -0.125, 0, 0.25, 0., \
        // Point 11
        0, 0.125, 0., 0, 0.125, 0, 0, -0.125, 0., 0, 0.125, 0, 0, 0.125, \
        -0.25, 0, -0.125, 0, 0, -0.125, 0.25, 0, -0.125, 0, 0, 0.125, 0., 0, \
        -0.125, 0, 0, -0.125, 0., 0, -0.125, 0, 0, 0.125, 0.25, 0, 0.125, 0, \
        0, -0.125, -0.25, 0, 0.125, 0., \
        // Point 12
        0, 0.125, 0.25, 0, 0.125, 0, 0, -0.125, -0.25, 0, 0.125, 0, 0, 0.125, \
        0., 0, -0.125, 0, 0, -0.125, 0., 0, -0.125, 0, 0, 0.125, -0.25, 0, \
        -0.125, 0, 0, -0.125, 0.25, 0, -0.125, 0, 0, 0.125, 0., 0, 0.125, 0, \
        0, -0.125, 0., 0, 0.125, 0., \
        // Point 13
        0, 0., 0.125, 0, 0.125, 0, 0, 0., -0.125, 0, 0.125, 0, 0, 0., -0.125, \
        0, -0.125, 0, 0, 0., 0.125, 0, -0.125, 0, 0, 0.25, -0.125, 0, -0.125, \
        0, 0, -0.25, 0.125, 0, -0.125, 0, 0, 0.25, 0.125, 0, 0.125, 0, 0, \
        -0.25, -0.125, 0, 0.125, 0., \
        // Point 14
        0, 0.25, 0.125, 0, 0.125, 0, 0, -0.25, -0.125, 0, 0.125, 0, 0, 0.25, \
        -0.125, 0, -0.125, 0, 0, -0.25, 0.125, 0, -0.125, 0, 0, 0., -0.125, \
        0, -0.125, 0, 0, 0., 0.125, 0, -0.125, 0, 0, 0., 0.125, 0, 0.125, 0, \
        0, 0., -0.125, 0, 0.125, 0.
    };

    try {

        // Dimensions for the output arrays:
        int numFields = hexBasis.getCardinality();
        int numPoints = hexNodes.dimension(0);
        int spaceDim  = hexBasis.getBaseCellTopology().getDimension();
        int D2Cardin  = Intrepid2::getDkCardinality(OPERATOR_D2, spaceDim);

        // Generic array for values, grads, curls, etc. that will be properly sized before each call
        FieldContainer<double> vals;

        // Check VALUE of basis functions: resize vals to rank-2 container:
        vals.resize(numFields, numPoints);
        hexBasis.getValues(vals, hexNodes, OPERATOR_VALUE);
        for (int i = 0; i < numFields; i++) {
            for (int j = 0; j < numPoints; j++) {
                int l =  i + j * numFields;
                if (std::abs(vals(i,j) - basisValues[l]) > INTREPID_TOL) {
                    errorFlag++;
                    *outStream << std::setw(70) << "^^^^----FAILURE!" << "\n";

                    // Output the multi-index of the value where the error is:
                    *outStream << " At multi-index { ";
                    *outStream << i << " ";
                    *outStream << j << " ";
                    *outStream << "}  computed value: " << vals(i,j)
                               << " but reference value: " << basisValues[l] << "\n";
                }
            }
        }

        // Check GRAD of basis function: resize vals to rank-3 container
        vals.resize(numFields, numPoints, spaceDim);
        hexBasis.getValues(vals, hexNodes, OPERATOR_GRAD);
        for (int i = 0; i < numFields; i++) {
            for (int j = 0; j < numPoints; j++) {
                for (int k = 0; k < spaceDim; k++) {
                    int l = k + i * spaceDim + j * spaceDim * numFields;
                    if (std::abs(vals(i,j,k) - basisGrads[l]) > INTREPID_TOL) {
                        errorFlag++;
                        *outStream << std::setw(70) << "^^^^----FAILURE!" << "\n";

                        // Output the multi-index of the value where the error is:
                        *outStream << " At multi-index { ";
                        *outStream << i << " ";
                        *outStream << j << " ";
                        *outStream << k << " ";
                        *outStream << "}  computed grad component: " << vals(i,j,k)
                                   << " but reference grad component: " << basisGrads[l] << "\n";
                    }
                }
            }
        }

        // Check D1 of basis function (do not resize vals because it has the correct size: D1 = GRAD)
        hexBasis.getValues(vals, hexNodes, OPERATOR_D1);
        for (int i = 0; i < numFields; i++) {
            for (int j = 0; j < numPoints; j++) {
                for (int k = 0; k < spaceDim; k++) {
                    int l = k + i * spaceDim + j * spaceDim * numFields;
                    if (std::abs(vals(i,j,k) - basisGrads[l]) > INTREPID_TOL) {
                        errorFlag++;
                        *outStream << std::setw(70) << "^^^^----FAILURE!" << "\n";

                        // Output the multi-index of the value where the error is:
                        *outStream << " At multi-index { ";
                        *outStream << i << " ";
                        *outStream << j << " ";
                        *outStream << k << " ";
                        *outStream << "}  computed D1 component: " << vals(i,j,k)
                                   << " but reference D1 component: " << basisGrads[l] << "\n";
                    }
                }
            }
        }


        // Check D2 of basis function
        vals.resize(numFields, numPoints, D2Cardin);
        hexBasis.getValues(vals, hexNodes, OPERATOR_D2);
        for (int i = 0; i < numFields; i++) {
            for (int j = 0; j < numPoints; j++) {
                for (int k = 0; k < D2Cardin; k++) {
                    int l = k + i * D2Cardin + j * D2Cardin * numFields;
                    if (std::abs(vals(i,j,k) - basisD2[l]) > INTREPID_TOL) {
                        errorFlag++;
                        *outStream << std::setw(70) << "^^^^----FAILURE!" << "\n";

                        // Output the multi-index of the value where the error is:
                        *outStream << " At multi-index { ";
                        *outStream << i << " ";
                        *outStream << j << " ";
                        *outStream << k << " ";
                        *outStream << "}  computed D2 component: " << vals(i,j,k)
                                   << " but reference D2 component: " << basisD2[l] << "\n";
                    }
                }
            }
        }

        // Check all higher derivatives - must be zero.
        for(EOperator op = OPERATOR_D3; op < OPERATOR_MAX; op++) {

            // The last dimension is the number of kth derivatives and needs to be resized for every Dk
            int DkCardin  = Intrepid2::getDkCardinality(op, spaceDim);
            vals.resize(numFields, numPoints, DkCardin);

            hexBasis.getValues(vals, hexNodes, op);
            for (int i1 = 0; i1 < numFields; i1++)
                for (int i2 = 0; i2 < numPoints; i2++)
                    for (int i3 = 0; i3 < DkCardin; i3++) {
                        if (std::abs(vals(i1,i2,i3)) > INTREPID_TOL) {
                            errorFlag++;
                            *outStream << std::setw(70) << "^^^^----FAILURE!" << "\n";

                            // Get the multi-index of the value where the error is and the operator order
                            int ord = Intrepid2::getOperatorOrder(op);
                            *outStream << " At multi-index { "<<i1<<" "<<i2 <<" "<<i3;
                            *outStream << "}  computed D"<< ord <<" component: " << vals(i1,i2,i3)
                                       << " but reference D" << ord << " component:  0 \n";
                        }
                    }
        }
    }

    // Catch unexpected errors
    catch (std::logic_error err) {
        *outStream << err.what() << "\n\n";
        errorFlag = -1000;
    };

    *outStream \
            << "\n"
            << "===============================================================================\n"\
            << "| TEST 4: correctness of DoF locations                                        |\n"\
            << "===============================================================================\n";

    try {
        Teuchos::RCP<Basis<double, FieldContainer<double> > > basis =
            Teuchos::rcp(new Basis_HGRAD_HEX_C1_FEM<double, FieldContainer<double> >);
        Teuchos::RCP<DofCoordsInterface<FieldContainer<double> > > coord_iface =
            Teuchos::rcp_dynamic_cast<DofCoordsInterface<FieldContainer<double> > >(basis);

        FieldContainer<double> cvals;
        FieldContainer<double> bvals(basis->getCardinality(), basis->getCardinality());

        // Check exceptions.
#ifdef HAVE_INTREPID2_DEBUG
        cvals.resize(1,2,3);
        INTREPID_TEST_COMMAND( coord_iface->getDofCoords(cvals), throwCounter, nException );
        cvals.resize(3,2);
        INTREPID_TEST_COMMAND( coord_iface->getDofCoords(cvals), throwCounter, nException );
        cvals.resize(8,2);
        INTREPID_TEST_COMMAND( coord_iface->getDofCoords(cvals), throwCounter, nException );
#endif
        cvals.resize(8,3);
        INTREPID_TEST_COMMAND( coord_iface->getDofCoords(cvals), throwCounter, nException );
        nException--;
        // Check if number of thrown exceptions matches the one we expect
        if (throwCounter != nException) {
            errorFlag++;
            *outStream << std::setw(70) << "^^^^----FAILURE!" << "\n";
        }

        // Check mathematical correctness.
        basis->getValues(bvals, cvals, OPERATOR_VALUE);
        char buffer[120];
        for (int i=0; i<bvals.dimension(0); i++) {
            for (int j=0; j<bvals.dimension(1); j++) {
                if ((i != j) && (std::abs(bvals(i,j) - 0.0) > INTREPID_TOL)) {
                    errorFlag++;
                    sprintf(buffer, "\nValue of basis function %d at (%6.4e, %6.4e, %6.4e) is %6.4e but should be %6.4e!\n", i, cvals(i,0), cvals(i,1), cvals(i,2), bvals(i,j), 0.0);
                    *outStream << buffer;
                }
                else if ((i == j) && (std::abs(bvals(i,j) - 1.0) > INTREPID_TOL)) {
                    errorFlag++;
                    sprintf(buffer, "\nValue of basis function %d at (%6.4e, %6.4e, %6.4e) is %6.4e but should be %6.4e!\n", i, cvals(i,0), cvals(i,1), cvals(i,2), bvals(i,j), 1.0);
                    *outStream << buffer;
                }
            }
        }

    }
    catch (std::logic_error err) {
        *outStream << err.what() << "\n\n";
        errorFlag = -1000;
    };

    if (errorFlag != 0)
        std::cout << "End Result: TEST FAILED\n";
    else
        std::cout << "End Result: TEST PASSED\n";

    // reset format state of std::cout
    std::cout.copyfmt(oldFormatState);
    Kokkos::finalize();
    return errorFlag;
}
int main(int argc, char *argv[]) {

   //Check number of arguments
   if (argc < 13) {
      std::cout <<"\n>>> ERROR: Invalid number of arguments.\n\n";
      std::cout <<"Usage:\n\n";
      std::cout <<"  ./Intrepid_example_Drivers_Example_01.exe NX NY NZ randomMesh mu1 mu2 mu1LX mu1RX mu1LY mu1RY mu1LZ mu1RZ verbose\n\n";
      std::cout <<" where \n";
      std::cout <<"   int NX              - num intervals in x direction (assumed box domain, -1,1) \n";
      std::cout <<"   int NY              - num intervals in y direction (assumed box domain, -1,1) \n";
      std::cout <<"   int NZ              - num intervals in z direction (assumed box domain, -1,1) \n";
      std::cout <<"   int randomMesh      - 1 if mesh randomizer is to be used 0 if not \n";
      std::cout <<"   double mu1          - material property value for region 1 \n";
      std::cout <<"   double mu2          - material property value for region 2 \n";
      std::cout <<"   double mu1LX        - left X boundary for region 1 \n";
      std::cout <<"   double mu1RX        - right X boundary for region 1 \n";
      std::cout <<"   double mu1LY        - left Y boundary for region 1 \n";
      std::cout <<"   double mu1RY        - right Y boundary for region 1 \n";
      std::cout <<"   double mu1LZ        - bottom Z boundary for region 1 \n";
      std::cout <<"   double mu1RZ        - top Z boundary for region 1 \n";
      std::cout <<"   verbose (optional)  - any character, indicates verbose output \n\n";
      exit(1);
   }
  
  // This little trick lets us print to std::cout only if
  // a (dummy) command-line argument is provided.
  int iprint     = argc - 1;
  Teuchos::RCP<std::ostream> outStream;
  Teuchos::oblackholestream bhs; // outputs nothing
  if (iprint > 12)
    outStream = Teuchos::rcp(&std::cout, false);
  else
    outStream = Teuchos::rcp(&bhs, false);
  
  // Save the format state of the original std::cout.
  Teuchos::oblackholestream oldFormatState;
  oldFormatState.copyfmt(std::cout);
  
  *outStream \
    << "===============================================================================\n" \
    << "|                                                                             |\n" \
    << "|  Example: Generate Mass and Stiffness Matrices and Right-Hand Side Vector   |\n"
    << "|    for Div-Curl System on Hexahedral Mesh with Curl-Conforming Elements     |\n" \
    << "|                                                                             |\n" \
    << "|  Questions? Contact  Pavel Bochev  ([email protected]),                    |\n" \
    << "|                      Denis Ridzal  ([email protected]),                    |\n" \
    << "|                      Kara Peterson ([email protected]).                    |\n" \
    << "|                                                                             |\n" \
    << "|  Intrepid's website: http://trilinos.sandia.gov/packages/intrepid           |\n" \
    << "|  Trilinos website:   http://trilinos.sandia.gov                             |\n" \
    << "|                                                                             |\n" \
    << "===============================================================================\n";


// ************************************ GET INPUTS **************************************

  /* In the implementation for discontinuous material properties only the boundaries for
     region 1, associated with mu1, are input. The remainder of the grid is assumed to use mu2.
     Note that the material properties are assigned using the undeformed grid. */

    int NX            = atoi(argv[1]);  // num intervals in x direction (assumed box domain, -1,1)
    int NY            = atoi(argv[2]);  // num intervals in y direction (assumed box domain, -1,1)
    int NZ            = atoi(argv[3]);  // num intervals in z direction (assumed box domain, -1,1)
    int randomMesh    = atoi(argv[4]);  // 1 if mesh randomizer is to be used 0 if not
    double mu1        = atof(argv[5]);  // material property value for region 1
    double mu2        = atof(argv[6]);  // material property value for region 2
    double mu1LeftX   = atof(argv[7]);  // left X boundary for region 1
    double mu1RightX  = atof(argv[8]);  // right X boundary for region 1
    double mu1LeftY   = atof(argv[9]);  // left Y boundary for region 1
    double mu1RightY  = atof(argv[10]); // right Y boundary for region 1
    double mu1LeftZ   = atof(argv[11]); // left Z boundary for region 1
    double mu1RightZ  = atof(argv[12]); // right Z boundary for region 1

// *********************************** CELL TOPOLOGY **********************************

   // Get cell topology for base hexahedron
    typedef shards::CellTopology    CellTopology;
    CellTopology hex_8(shards::getCellTopologyData<shards::Hexahedron<8> >() );

   // Get dimensions 
    int numNodesPerElem = hex_8.getNodeCount();
    int numEdgesPerElem = hex_8.getEdgeCount();
    int numFacesPerElem = hex_8.getSideCount();
    int numNodesPerEdge = 2;
    int numNodesPerFace = 4;
    int numEdgesPerFace = 4;
    int spaceDim = hex_8.getDimension();

   // Build reference element edge to node map
    FieldContainer<int> refEdgeToNode(numEdgesPerElem,numNodesPerEdge);
    for (int i=0; i<numEdgesPerElem; i++){
        refEdgeToNode(i,0)=hex_8.getNodeMap(1, i, 0);
        refEdgeToNode(i,1)=hex_8.getNodeMap(1, i, 1);
    }

// *********************************** GENERATE MESH ************************************

    *outStream << "Generating mesh ... \n\n";

    *outStream << "    NX" << "   NY" << "   NZ\n";
    *outStream << std::setw(5) << NX <<
                 std::setw(5) << NY <<
                 std::setw(5) << NZ << "\n\n";

   // Compute and print mesh entity counts (elements, nodes, edges, faces)
    int numElems = NX*NY*NZ;
    int numNodes = (NX+1)*(NY+1)*(NZ+1);
    int numEdges = (NX)*(NY + 1)*(NZ + 1) + (NX + 1)*(NY)*(NZ + 1) + (NX + 1)*(NY + 1)*(NZ);
    int numFaces = (NX)*(NY)*(NZ + 1) + (NX)*(NY + 1)*(NZ) + (NX + 1)*(NY)*(NZ);
    *outStream << " Number of Elements: " << numElems << " \n";
    *outStream << "    Number of Nodes: " << numNodes << " \n";
    *outStream << "    Number of Edges: " << numEdges << " \n";
    *outStream << "    Number of Faces: " << numFaces << " \n\n";

   // Cube
    double leftX = -1.0, rightX = 1.0;
    double leftY = -1.0, rightY = 1.0;
    double leftZ = -1.0, rightZ = 1.0;

   // Mesh spacing
    double hx = (rightX-leftX)/((double)NX);
    double hy = (rightY-leftY)/((double)NY);
    double hz = (rightZ-leftZ)/((double)NZ);

    // Get nodal coordinates
    FieldContainer<double> nodeCoord(numNodes, spaceDim);
    FieldContainer<int> nodeOnBoundary(numNodes);
    int inode = 0;
    for (int k=0; k<NZ+1; k++) {
      for (int j=0; j<NY+1; j++) {
        for (int i=0; i<NX+1; i++) {
          nodeCoord(inode,0) = leftX + (double)i*hx;
          nodeCoord(inode,1) = leftY + (double)j*hy;
          nodeCoord(inode,2) = leftZ + (double)k*hz;
          if (k==0 || j==0 || i==0 || k==NZ || j==NY || i==NX){
             nodeOnBoundary(inode)=1; 
          }
          inode++;
        }
      }
    }
    
   // Element to Node map
    FieldContainer<int> elemToNode(numElems, numNodesPerElem);
    int ielem = 0;
    for (int k=0; k<NZ; k++) {
      for (int j=0; j<NY; j++) {
        for (int i=0; i<NX; i++) {
          elemToNode(ielem,0) = (NY + 1)*(NX + 1)*k + (NX + 1)*j + i;
          elemToNode(ielem,1) = (NY + 1)*(NX + 1)*k + (NX + 1)*j + i + 1;
          elemToNode(ielem,2) = (NY + 1)*(NX + 1)*k + (NX + 1)*(j + 1) + i + 1;
          elemToNode(ielem,3) = (NY + 1)*(NX + 1)*k + (NX + 1)*(j + 1) + i;
          elemToNode(ielem,4) = (NY + 1)*(NX + 1)*(k + 1) + (NX + 1)*j + i;
          elemToNode(ielem,5) = (NY + 1)*(NX + 1)*(k + 1) + (NX + 1)*j + i + 1;
          elemToNode(ielem,6) = (NY + 1)*(NX + 1)*(k + 1) + (NX + 1)*(j + 1) + i + 1;
          elemToNode(ielem,7) = (NY + 1)*(NX + 1)*(k + 1) + (NX + 1)*(j + 1) + i;
          ielem++;
        }
      }
    }

  // Get edge connectivity
    FieldContainer<int> edgeToNode(numEdges, numNodesPerEdge);
    FieldContainer<int> elemToEdge(numElems, numEdgesPerElem);
    int iedge = 0;
    inode = 0;
    for (int k=0; k<NZ+1; k++) {
      for (int j=0; j<NY+1; j++) {
        for (int i=0; i<NX+1; i++) {
           if (i < NX){
               edgeToNode(iedge,0) = inode;
               edgeToNode(iedge,1) = inode + 1;
               if (j < NY && k < NZ){
                  ielem=i+j*NX+k*NX*NY;
                  elemToEdge(ielem,0) = iedge;
                  if (j > 0)
                     elemToEdge(ielem-NX,2) = iedge; 
                  if (k > 0)
                     elemToEdge(ielem-NX*NY,4) = iedge; 
                  if (j > 0 && k > 0)
                     elemToEdge(ielem-NX*NY-NX,6) = iedge; 
                }
               else if (j == NY && k == NZ){
                  ielem=i+(NY-1)*NX+(NZ-1)*NX*NY;
                  elemToEdge(ielem,6) = iedge;
                }
               else if (k == NZ && j < NY){
                  ielem=i+j*NX+(NZ-1)*NX*NY;
                  elemToEdge(ielem,4) = iedge;
                  if (j > 0)
                    elemToEdge(ielem-NX,6) = iedge;
                }
               else if (k < NZ && j == NY){
                  ielem=i+(NY-1)*NX+k*NX*NY;
                  elemToEdge(ielem,2) = iedge;
                  if (k > 0)
                     elemToEdge(ielem-NX*NY,6) = iedge;
                }
               iedge++;
            }
           if (j < NY){
               edgeToNode(iedge,0) = inode;
               edgeToNode(iedge,1) = inode + NX+1;
               if (i < NX && k < NZ){
                  ielem=i+j*NX+k*NX*NY;
                  elemToEdge(ielem,3) = iedge;
                  if (i > 0)
                     elemToEdge(ielem-1,1) = iedge; 
                  if (k > 0)
                     elemToEdge(ielem-NX*NY,7) = iedge; 
                  if (i > 0 && k > 0)
                     elemToEdge(ielem-NX*NY-1,5) = iedge; 
                }
               else if (i == NX && k == NZ){
                  ielem=NX-1+j*NX+(NZ-1)*NX*NY;
                  elemToEdge(ielem,5) = iedge;
                }
               else if (k == NZ && i < NX){
                  ielem=i+j*NX+(NZ-1)*NX*NY;
                  elemToEdge(ielem,7) = iedge;
                  if (i > 0)
                    elemToEdge(ielem-1,5) = iedge;
                }
               else if (k < NZ && i == NX){
                  ielem=NX-1+j*NX+k*NX*NY;
                  elemToEdge(ielem,1) = iedge;
                  if (k > 0)
                     elemToEdge(ielem-NX*NY,5) = iedge;
                }
               iedge++;
            }
           if (k < NZ){
               edgeToNode(iedge,0) = inode;
               edgeToNode(iedge,1) = inode + (NX+1)*(NY+1);
               if (i < NX && j < NY){
                  ielem=i+j*NX+k*NX*NY;
                  elemToEdge(ielem,8) = iedge;
                  if (i > 0)
                     elemToEdge(ielem-1,9) = iedge; 
                  if (j > 0)
                     elemToEdge(ielem-NX,11) = iedge; 
                  if (i > 0 && j > 0)
                     elemToEdge(ielem-NX-1,10) = iedge; 
                }
               else if (i == NX && j == NY){
                  ielem=NX-1+(NY-1)*NX+k*NX*NY;
                  elemToEdge(ielem,10) = iedge;
                }
               else if (j == NY && i < NX){
                  ielem=i+(NY-1)*NX+k*NX*NY;
                  elemToEdge(ielem,11) = iedge;
                  if (i > 0)
                    elemToEdge(ielem-1,10) = iedge;
                }
               else if (j < NY && i == NX){
                  ielem=NX-1+j*NX+k*NX*NY;
                  elemToEdge(ielem,9) = iedge;
                  if (j > 0)
                     elemToEdge(ielem-NX,10) = iedge;
                }
               iedge++;
            }
            inode++;
         }
      }
   }

   // Find boundary edges  
    FieldContainer<int> edgeOnBoundary(numEdges);
    for (int i=0; i<numEdges; i++){
       if (nodeOnBoundary(edgeToNode(i,0)) && nodeOnBoundary(edgeToNode(i,1))){
           edgeOnBoundary(i)=1;
       }
    }

   // Get face connectivity
    FieldContainer<int> faceToNode(numFaces, numNodesPerFace);
    FieldContainer<int> elemToFace(numElems, numFacesPerElem);
    FieldContainer<int> faceToEdge(numFaces, numEdgesPerFace);
    int iface = 0;
    inode = 0;
    for (int k=0; k<NZ+1; k++) {
      for (int j=0; j<NY+1; j++) {
        for (int i=0; i<NX+1; i++) {
           if (i < NX && k < NZ) {
              faceToNode(iface,0)=inode;
              faceToNode(iface,1)=inode + 1;
              faceToNode(iface,2)=inode + (NX+1)*(NY+1)+1;
              faceToNode(iface,3)=inode + (NX+1)*(NY+1);
              if (j < NY) {
                 ielem=i+j*NX+k*NX*NY;
                 faceToEdge(iface,0)=elemToEdge(ielem,0);
                 faceToEdge(iface,1)=elemToEdge(ielem,9);
                 faceToEdge(iface,2)=elemToEdge(ielem,4);
                 faceToEdge(iface,3)=elemToEdge(ielem,8);
                 elemToFace(ielem,0)=iface;
                 if (j > 0) {
                    elemToFace(ielem-NX,2)=iface;
                 }
              }
              else if (j == NY) {
                 ielem=i+(NY-1)*NX+k*NX*NY;
                 faceToEdge(iface,0)=elemToEdge(ielem,2);
                 faceToEdge(iface,1)=elemToEdge(ielem,10);
                 faceToEdge(iface,2)=elemToEdge(ielem,6);
                 faceToEdge(iface,3)=elemToEdge(ielem,11);
                 elemToFace(ielem,2)=iface;
              }
              iface++;
           }
           if (j < NY && k < NZ) {
              faceToNode(iface,0)=inode;
              faceToNode(iface,1)=inode + NX+1;
              faceToNode(iface,2)=inode + (NX+1)*(NY+1) + NX+1;
              faceToNode(iface,3)=inode + (NX+1)*(NY+1);
              if (i < NX) {
                 ielem=i+j*NX+k*NX*NY;
                 faceToEdge(iface,0)=elemToEdge(ielem,3);
                 faceToEdge(iface,1)=elemToEdge(ielem,11);
                 faceToEdge(iface,2)=elemToEdge(ielem,7);
                 faceToEdge(iface,3)=elemToEdge(ielem,8);
                 elemToFace(ielem,3)=iface;
                 if (i > 0) {
                    elemToFace(ielem-1,1)=iface;
                 }
              }
              else if (i == NX) {
                 ielem=NX-1+j*NX+k*NX*NY;
                 faceToEdge(iface,0)=elemToEdge(ielem,1);
                 faceToEdge(iface,1)=elemToEdge(ielem,10);
                 faceToEdge(iface,2)=elemToEdge(ielem,5);
                 faceToEdge(iface,3)=elemToEdge(ielem,9);
                 elemToFace(ielem,1)=iface;
              }
              iface++;
           }
           if (i < NX && j < NY) {
              faceToNode(iface,0)=inode;
              faceToNode(iface,1)=inode + 1;
              faceToNode(iface,2)=inode + NX+2;
              faceToNode(iface,3)=inode + NX+1;
              if (k < NZ) {
                 ielem=i+j*NX+k*NX*NY;
                 faceToEdge(iface,0)=elemToEdge(ielem,0);
                 faceToEdge(iface,1)=elemToEdge(ielem,1);
                 faceToEdge(iface,2)=elemToEdge(ielem,2);
                 faceToEdge(iface,3)=elemToEdge(ielem,3);
                 elemToFace(ielem,4)=iface;
                 if (k > 0) {
                    elemToFace(ielem-NX*NY,5)=iface;
                 }
              }
              else if (k == NZ) {
                 ielem=i+j*NX+(NZ-1)*NX*NY;
                 faceToEdge(iface,0)=elemToEdge(ielem,4);
                 faceToEdge(iface,1)=elemToEdge(ielem,5);
                 faceToEdge(iface,2)=elemToEdge(ielem,6);
                 faceToEdge(iface,3)=elemToEdge(ielem,7);
                 elemToFace(ielem,5)=iface;
              }
              iface++;
           }
          inode++;
         }
      }
   }

   // Find boundary faces  
    FieldContainer<int> faceOnBoundary(numFaces);
    for (int i=0; i<numFaces; i++){
       if (nodeOnBoundary(faceToNode(i,0)) && nodeOnBoundary(faceToNode(i,1))
          && nodeOnBoundary(faceToNode(i,2)) && nodeOnBoundary(faceToNode(i,3))){
           faceOnBoundary(i)=1;
       }
    }
        
 
#define DUMP_DATA
#ifdef DUMP_DATA
   // Output connectivity
    ofstream fe2nout("elem2node.dat");
    ofstream fe2eout("elem2edge.dat");
    for (int k=0; k<NZ; k++) {
      for (int j=0; j<NY; j++) {
        for (int i=0; i<NX; i++) {
          int ielem = i + j * NX + k * NX * NY;
          for (int m=0; m<numNodesPerElem; m++){
              fe2nout << elemToNode(ielem,m) <<"  ";
           }
          fe2nout <<"\n";
          for (int l=0; l<numEdgesPerElem; l++) {
             fe2eout << elemToEdge(ielem,l) << "  ";
          }
          fe2eout << "\n";
        }
      }
    }
    fe2nout.close();
    fe2eout.close();
#endif

#ifdef DUMP_DATA_EXTRA
    ofstream fed2nout("edge2node.dat");
    for (int i=0; i<numEdges; i++) {
       fed2nout << edgeToNode(i,0) <<" ";
       fed2nout << edgeToNode(i,1) <<"\n";
    }
    fed2nout.close();

    ofstream fBnodeout("nodeOnBndy.dat");
    ofstream fBedgeout("edgeOnBndy.dat");
    for (int i=0; i<numNodes; i++) {
        fBnodeout << nodeOnBoundary(i) <<"\n";
    }
    for (int i=0; i<numEdges; i++) {
        fBedgeout << edgeOnBoundary(i) <<"\n";
    }
    fBnodeout.close();
    fBedgeout.close();
#endif

   // Set material properties using undeformed grid assuming each element has only one value of mu
    FieldContainer<double> muVal(numElems);
    for (int k=0; k<NZ; k++) {
      for (int j=0; j<NY; j++) {
        for (int i=0; i<NX; i++) {
          int ielem = i + j * NX + k * NX * NY;
          double midElemX = nodeCoord(elemToNode(ielem,0),0) + hx/2.0;
          double midElemY = nodeCoord(elemToNode(ielem,0),1) + hy/2.0;
          double midElemZ = nodeCoord(elemToNode(ielem,0),2) + hz/2.0;
          if ( (midElemX > mu1LeftX) && (midElemY > mu1LeftY) && (midElemZ > mu1LeftZ) &&
               (midElemX <= mu1RightX) && (midElemY <= mu1RightY) && (midElemZ <= mu1RightZ) ){
             muVal(ielem) = mu1;
          }
           else {
             muVal(ielem) = mu2;
          }
        }
      }
    }

   // Perturb mesh coordinates (only interior nodes)
    if (randomMesh){
      for (int k=1; k<NZ; k++) {
        for (int j=1; j<NY; j++) {
          for (int i=1; i<NX; i++) {
            int inode = i + j * (NX + 1) + k * (NX + 1) * (NY + 1);
           // random numbers between -1.0 and 1.0
            double rx = 2.0 * (double)rand()/RAND_MAX - 1.0;
            double ry = 2.0 * (double)rand()/RAND_MAX - 1.0;
            double rz = 2.0 * (double)rand()/RAND_MAX - 1.0; 
           // limit displacement to +/- 1/8 of the mesh spacing (total spread of 1/4 edge length)
            nodeCoord(inode,0) = nodeCoord(inode,0) + 0.125 * hx * rx;
            nodeCoord(inode,1) = nodeCoord(inode,1) + 0.125 * hy * ry;
            nodeCoord(inode,2) = nodeCoord(inode,2) + 0.125 * hz * rz;
          }
        }
      }
    }

#ifdef DUMP_DATA
   // Print nodal coords
    ofstream fcoordout("coords.dat");
    for (int i=0; i<numNodes; i++) {
       fcoordout << nodeCoord(i,0) <<" ";
       fcoordout << nodeCoord(i,1) <<" ";
       fcoordout << nodeCoord(i,2) <<"\n";
    }
    fcoordout.close();
#endif


// **************************** INCIDENCE MATRIX **************************************

   // Node to edge incidence matrix
    *outStream << "Building incidence matrix ... \n\n";

    Epetra_SerialComm Comm;
    Epetra_Map globalMapC(numEdges, 0, Comm);
    Epetra_Map globalMapG(numNodes, 0, Comm);
    Epetra_FECrsMatrix DGrad(Copy, globalMapC, globalMapG, 2);

    double vals[2];
    vals[0]=-1.0; vals[1]=1.0;
    for (int j=0; j<numEdges; j++){
        int rowNum = j;
        int colNum[2];
        colNum[0] = edgeToNode(j,0);
        colNum[1] = edgeToNode(j,1);
        DGrad.InsertGlobalValues(1, &rowNum, 2, colNum, vals);
    }


// ************************************ CUBATURE **************************************

   // Get numerical integration points and weights for element
    *outStream << "Getting cubature ... \n\n";

    DefaultCubatureFactory<double>  cubFactory;                                   
    int cubDegree = 2;
    Teuchos::RCP<Cubature<double> > hexCub = cubFactory.create(hex_8, cubDegree); 

    int cubDim       = hexCub->getDimension();
    int numCubPoints = hexCub->getNumPoints();

    FieldContainer<double> cubPoints(numCubPoints, cubDim);
    FieldContainer<double> cubWeights(numCubPoints);

    hexCub->getCubature(cubPoints, cubWeights);


   // Get numerical integration points and weights for hexahedron face
    //             (needed for rhs boundary term)

    // Define topology of the face parametrization domain as [-1,1]x[-1,1]
    CellTopology paramQuadFace(shards::getCellTopologyData<shards::Quadrilateral<4> >() );

    // Define cubature
    DefaultCubatureFactory<double>  cubFactoryFace;
    Teuchos::RCP<Cubature<double> > hexFaceCubature = cubFactoryFace.create(paramQuadFace, 3);
    int cubFaceDim    = hexFaceCubature -> getDimension();
    int numFacePoints = hexFaceCubature -> getNumPoints();

    // Define storage for cubature points and weights on [-1,1]x[-1,1]
    FieldContainer<double> paramGaussWeights(numFacePoints);
    FieldContainer<double> paramGaussPoints(numFacePoints,cubFaceDim);

    // Retrieve cubature points and weights on the face parametrization domain [-1,1]x[-1,1]
    hexFaceCubature -> getCubature(paramGaussPoints, paramGaussWeights);


// ************************************** BASIS ***************************************

   // Define basis 
    *outStream << "Getting basis ... \n\n";
    Basis_HCURL_HEX_I1_FEM<double, FieldContainer<double> > hexHCurlBasis;
    Basis_HGRAD_HEX_C1_FEM<double, FieldContainer<double> > hexHGradBasis;

    int numFieldsC = hexHCurlBasis.getCardinality();
    int numFieldsG = hexHGradBasis.getCardinality();

  // Evaluate basis at cubature points
     FieldContainer<double> hexGVals(numFieldsG, numCubPoints); 
     FieldContainer<double> hexCVals(numFieldsC, numCubPoints, spaceDim); 
     FieldContainer<double> hexCurls(numFieldsC, numCubPoints, spaceDim); 
     FieldContainer<double> worksetCVals(numFieldsC, numFacePoints, spaceDim);


     hexHCurlBasis.getValues(hexCVals, cubPoints, OPERATOR_VALUE);
     hexHCurlBasis.getValues(hexCurls, cubPoints, OPERATOR_CURL);
     hexHGradBasis.getValues(hexGVals, cubPoints, OPERATOR_VALUE);


// ******** LOOP OVER ELEMENTS TO CREATE LOCAL MASS and STIFFNESS MATRICES *************


    *outStream << "Building mass and stiffness matrices ... \n\n";

 // Settings and data structures for mass and stiffness matrices
    typedef CellTools<double>  CellTools;
    typedef FunctionSpaceTools fst;
    //typedef ArrayTools art;
    int numCells = 1; 

   // Containers for nodes and edge signs 
    FieldContainer<double> hexNodes(numCells, numNodesPerElem, spaceDim);
    FieldContainer<double> hexEdgeSigns(numCells, numFieldsC);
   // Containers for Jacobian
    FieldContainer<double> hexJacobian(numCells, numCubPoints, spaceDim, spaceDim);
    FieldContainer<double> hexJacobInv(numCells, numCubPoints, spaceDim, spaceDim);
    FieldContainer<double> hexJacobDet(numCells, numCubPoints);
   // Containers for element HGRAD mass matrix
    FieldContainer<double> massMatrixG(numCells, numFieldsG, numFieldsG);
    FieldContainer<double> weightedMeasure(numCells, numCubPoints);
    FieldContainer<double> weightedMeasureMuInv(numCells, numCubPoints);
    FieldContainer<double> hexGValsTransformed(numCells, numFieldsG, numCubPoints);
    FieldContainer<double> hexGValsTransformedWeighted(numCells, numFieldsG, numCubPoints);
   // Containers for element HCURL mass matrix
    FieldContainer<double> massMatrixC(numCells, numFieldsC, numFieldsC);
    FieldContainer<double> hexCValsTransformed(numCells, numFieldsC, numCubPoints, spaceDim);
    FieldContainer<double> hexCValsTransformedWeighted(numCells, numFieldsC, numCubPoints, spaceDim);
   // Containers for element HCURL stiffness matrix
    FieldContainer<double> stiffMatrixC(numCells, numFieldsC, numFieldsC);
    FieldContainer<double> weightedMeasureMu(numCells, numCubPoints);    
    FieldContainer<double> hexCurlsTransformed(numCells, numFieldsC, numCubPoints, spaceDim);
    FieldContainer<double> hexCurlsTransformedWeighted(numCells, numFieldsC, numCubPoints, spaceDim);
   // Containers for right hand side vectors
    FieldContainer<double> rhsDatag(numCells, numCubPoints, cubDim);
    FieldContainer<double> rhsDatah(numCells, numCubPoints, cubDim);
    FieldContainer<double> gC(numCells, numFieldsC);
    FieldContainer<double> hC(numCells, numFieldsC);
    FieldContainer<double> hCBoundary(numCells, numFieldsC);
    FieldContainer<double> refGaussPoints(numFacePoints,spaceDim);
    FieldContainer<double> worksetGaussPoints(numCells,numFacePoints,spaceDim);
    FieldContainer<double> worksetJacobians(numCells, numFacePoints, spaceDim, spaceDim);
    FieldContainer<double> worksetJacobInv(numCells, numFacePoints, spaceDim, spaceDim);
    FieldContainer<double> worksetFaceN(numCells, numFacePoints, spaceDim);
    FieldContainer<double> worksetFaceNweighted(numCells, numFacePoints, spaceDim);
    FieldContainer<double> worksetVFieldVals(numCells, numFacePoints, spaceDim);
    FieldContainer<double> worksetCValsTransformed(numCells, numFieldsC, numFacePoints, spaceDim);
    FieldContainer<double> divuFace(numCells, numFacePoints);
    FieldContainer<double> worksetFieldDotNormal(numCells, numFieldsC, numFacePoints);
   // Container for cubature points in physical space
    FieldContainer<double> physCubPoints(numCells,numCubPoints, cubDim);

    
   // Global arrays in Epetra format
    Epetra_FECrsMatrix MassG(Copy, globalMapG, numFieldsG);
    Epetra_FECrsMatrix MassC(Copy, globalMapC, numFieldsC);
    Epetra_FECrsMatrix StiffC(Copy, globalMapC, numFieldsC);
    Epetra_FEVector rhsC(globalMapC);

#ifdef DUMP_DATA
    ofstream fSignsout("edgeSigns.dat");
#endif

 // *** Element loop ***
    for (int k=0; k<numElems; k++) {

     // Physical cell coordinates
      for (int i=0; i<numNodesPerElem; i++) {
         hexNodes(0,i,0) = nodeCoord(elemToNode(k,i),0);
         hexNodes(0,i,1) = nodeCoord(elemToNode(k,i),1);
         hexNodes(0,i,2) = nodeCoord(elemToNode(k,i),2);
      }

     // Edge signs
      for (int j=0; j<numEdgesPerElem; j++) {
          if (elemToNode(k,refEdgeToNode(j,0))==edgeToNode(elemToEdge(k,j),0) &&
              elemToNode(k,refEdgeToNode(j,1))==edgeToNode(elemToEdge(k,j),1))
              hexEdgeSigns(0,j) = 1.0;
          else 
              hexEdgeSigns(0,j) = -1.0;
#ifdef DUMP_DATA
          fSignsout << hexEdgeSigns(0,j) << "  ";
#endif
      }
#ifdef DUMP_DATA
       fSignsout << "\n";
#endif

    // Compute cell Jacobians, their inverses and their determinants
       CellTools::setJacobian(hexJacobian, cubPoints, hexNodes, hex_8);
       CellTools::setJacobianInv(hexJacobInv, hexJacobian );
       CellTools::setJacobianDet(hexJacobDet, hexJacobian );

// ************************** Compute element HGrad mass matrices *******************************
  
     // transform to physical coordinates 
      fst::HGRADtransformVALUE<double>(hexGValsTransformed, hexGVals);
      
     // compute weighted measure
      fst::computeCellMeasure<double>(weightedMeasure, hexJacobDet, cubWeights);

      // combine mu value with weighted measure
      for (int nC = 0; nC < numCells; nC++){
        for (int nPt = 0; nPt < numCubPoints; nPt++){
          weightedMeasureMuInv(nC,nPt) = weightedMeasure(nC,nPt) / muVal(k);
        }
      }
      
     // multiply values with weighted measure
      fst::multiplyMeasure<double>(hexGValsTransformedWeighted,
                                   weightedMeasureMuInv, hexGValsTransformed);

     // integrate to compute element mass matrix
      fst::integrate<double>(massMatrixG,
                             hexGValsTransformed, hexGValsTransformedWeighted, COMP_CPP);

      // assemble into global matrix
    //  int err = 0;
      for (int row = 0; row < numFieldsG; row++){
        for (int col = 0; col < numFieldsG; col++){
            int rowIndex = elemToNode(k,row);
            int colIndex = elemToNode(k,col);
            double val = massMatrixG(0,row,col);
            MassG.InsertGlobalValues(1, &rowIndex, 1, &colIndex, &val);
         }
      }

// ************************** Compute element HCurl mass matrices *******************************

     // transform to physical coordinates 
      fst::HCURLtransformVALUE<double>(hexCValsTransformed, hexJacobInv, 
                                   hexCVals);

     // multiply by weighted measure
      fst::multiplyMeasure<double>(hexCValsTransformedWeighted,
                                   weightedMeasure, hexCValsTransformed);

     // integrate to compute element mass matrix
      fst::integrate<double>(massMatrixC,
                             hexCValsTransformed, hexCValsTransformedWeighted,
                             COMP_CPP);

     // apply edge signs
      fst::applyLeftFieldSigns<double>(massMatrixC, hexEdgeSigns);
      fst::applyRightFieldSigns<double>(massMatrixC, hexEdgeSigns);


     // assemble into global matrix
      //err = 0;
      for (int row = 0; row < numFieldsC; row++){
        for (int col = 0; col < numFieldsC; col++){
            int rowIndex = elemToEdge(k,row);
            int colIndex = elemToEdge(k,col);
            double val = massMatrixC(0,row,col);
            MassC.InsertGlobalValues(1, &rowIndex, 1, &colIndex, &val);
         }
      }

// ************************ Compute element HCurl stiffness matrices *****************************

      // transform to physical coordinates 
      fst::HCURLtransformCURL<double>(hexCurlsTransformed, hexJacobian, hexJacobDet, 
                                       hexCurls);

      // combine mu value with weighted measure
      for (int nC = 0; nC < numCells; nC++){
        for (int nPt = 0; nPt < numCubPoints; nPt++){
          weightedMeasureMu(nC,nPt) = weightedMeasure(nC,nPt) / muVal(k);
         }
      }

     // multiply by weighted measure
      fst::multiplyMeasure<double>(hexCurlsTransformedWeighted,
                                   weightedMeasureMu, hexCurlsTransformed);

     // integrate to compute element stiffness matrix
      fst::integrate<double>(stiffMatrixC,
                             hexCurlsTransformed, hexCurlsTransformedWeighted,
                             COMP_CPP);

     // apply edge signs
     fst::applyLeftFieldSigns<double>(stiffMatrixC, hexEdgeSigns);
     fst::applyRightFieldSigns<double>(stiffMatrixC, hexEdgeSigns);

     // assemble into global matrix
      //err = 0;
      for (int row = 0; row < numFieldsC; row++){
        for (int col = 0; col < numFieldsC; col++){
            int rowIndex = elemToEdge(k,row);
            int colIndex = elemToEdge(k,col);
            double val = stiffMatrixC(0,row,col);
            StiffC.InsertGlobalValues(1, &rowIndex, 1, &colIndex, &val);
         }
      }

// ******************************* Build right hand side ************************************

      // transform integration points to physical points
       FieldContainer<double> physCubPoints(numCells,numCubPoints, cubDim);
       CellTools::mapToPhysicalFrame(physCubPoints, cubPoints, hexNodes, hex_8);

      // evaluate right hand side functions at physical points
       FieldContainer<double> rhsDatag(numCells, numCubPoints, cubDim);
       FieldContainer<double> rhsDatah(numCells, numCubPoints, cubDim);
       for (int nPt = 0; nPt < numCubPoints; nPt++){

          double x = physCubPoints(0,nPt,0);
          double y = physCubPoints(0,nPt,1);
          double z = physCubPoints(0,nPt,2);
          double du1, du2, du3;

          evalCurlu(du1, du2, du3, x, y, z);
          rhsDatag(0,nPt,0) = du1;
          rhsDatag(0,nPt,1) = du2;
          rhsDatag(0,nPt,2) = du3;
         
          evalGradDivu(du1, du2, du3,  x, y, z);
          rhsDatah(0,nPt,0) = du1;
          rhsDatah(0,nPt,1) = du2;
          rhsDatah(0,nPt,2) = du3;
       }

     // integrate (g,curl w) term
      fst::integrate<double>(gC, rhsDatag, hexCurlsTransformedWeighted,
                             COMP_CPP);

     // integrate -(grad h, w)  
      fst::integrate<double>(hC, rhsDatah, hexCValsTransformedWeighted,
                             COMP_CPP);

     // apply signs
      fst::applyFieldSigns<double>(gC, hexEdgeSigns);
      fst::applyFieldSigns<double>(hC, hexEdgeSigns);


     // calculate boundary term, (h*w, n)_{\Gamma} 
      for (int i = 0; i < numFacesPerElem; i++){
        if (faceOnBoundary(elemToFace(k,i))){

         // Map Gauss points on quad to reference face: paramGaussPoints -> refGaussPoints
            CellTools::mapToReferenceSubcell(refGaussPoints,
                                   paramGaussPoints,
                                   2, i, hex_8);

         // Get basis values at points on reference cell
           hexHCurlBasis.getValues(worksetCVals, refGaussPoints, OPERATOR_VALUE);

         // Compute Jacobians at Gauss pts. on reference face for all parent cells
           CellTools::setJacobian(worksetJacobians,
                         refGaussPoints,
                         hexNodes, hex_8);
           CellTools::setJacobianInv(worksetJacobInv, worksetJacobians );

         // transform to physical coordinates
            fst::HCURLtransformVALUE<double>(worksetCValsTransformed, worksetJacobInv,
                                   worksetCVals);

         // Map Gauss points on quad from ref. face to face workset: refGaussPoints -> worksetGaussPoints
            CellTools::mapToPhysicalFrame(worksetGaussPoints,
                                refGaussPoints,
                                hexNodes, hex_8);

         // Compute face normals
            CellTools::getPhysicalFaceNormals(worksetFaceN,
                                              worksetJacobians,
                                              i, hex_8);

         // multiply with weights
            for(int nPt = 0; nPt < numFacePoints; nPt++){
                for (int dim = 0; dim < spaceDim; dim++){
                   worksetFaceNweighted(0,nPt,dim) = worksetFaceN(0,nPt,dim) * paramGaussWeights(nPt);
                } //dim
             } //nPt

            fst::dotMultiplyDataField<double>(worksetFieldDotNormal, worksetFaceNweighted, 
                                               worksetCValsTransformed);

         // Evaluate div u at face points
           for(int nPt = 0; nPt < numFacePoints; nPt++){

             double x = worksetGaussPoints(0, nPt, 0);
             double y = worksetGaussPoints(0, nPt, 1);
             double z = worksetGaussPoints(0, nPt, 2);

             divuFace(0,nPt)=evalDivu(x, y, z);
           }

          // Integrate
          fst::integrate<double>(hCBoundary, divuFace, worksetFieldDotNormal,
                             COMP_CPP);

          // apply signs
           fst::applyFieldSigns<double>(hCBoundary, hexEdgeSigns);

         // add into hC term
            for (int nF = 0; nF < numFieldsC; nF++){
                hC(0,nF) = hC(0,nF) - hCBoundary(0,nF);
            }

        } // if faceOnBoundary
      } // numFaces


    // assemble into global vector
     for (int row = 0; row < numFieldsC; row++){
           int rowIndex = elemToEdge(k,row);
           double val = gC(0,row)-hC(0,row);
           rhsC.SumIntoGlobalValues(1, &rowIndex, &val);
     }
 
     
 } // *** end element loop ***

#ifdef DUMP_DATA
   fSignsout.close();
#endif

  // Assemble over multiple processors, if necessary
   MassG.GlobalAssemble();  MassG.FillComplete();
   MassC.GlobalAssemble();  MassC.FillComplete();
   StiffC.GlobalAssemble(); StiffC.FillComplete();
   rhsC.GlobalAssemble();
   DGrad.GlobalAssemble(); DGrad.FillComplete(MassG.RowMap(),MassC.RowMap());


  // Build the inverse diagonal for MassG
   Epetra_CrsMatrix MassGinv(Copy,MassG.RowMap(),MassG.RowMap(),1);
   Epetra_Vector DiagG(MassG.RowMap());

   DiagG.PutScalar(1.0);
   MassG.Multiply(false,DiagG,DiagG);
   for(int i=0; i<DiagG.MyLength(); i++) {
     DiagG[i]=1.0/DiagG[i];
   }
   for(int i=0; i<DiagG.MyLength(); i++) {
     int CID=MassG.GCID(i);
     MassGinv.InsertGlobalValues(MassG.GRID(i),1,&(DiagG[i]),&CID);
   }
   MassGinv.FillComplete();

  // Set value to zero on diagonal that cooresponds to boundary node
   for(int i=0;i<numNodes;i++) {
     if (nodeOnBoundary(i)){
      double val=0.0;
      MassGinv.ReplaceGlobalValues(i,1,&val,&i);
     }
   }

    int numEntries;
    double *values;
    int *cols;
  
  // Adjust matrices and rhs due to boundary conditions
   for (int row = 0; row<numEdges; row++){
      MassC.ExtractMyRowView(row,numEntries,values,cols);
        for (int i=0; i<numEntries; i++){
           if (edgeOnBoundary(cols[i])) {
             values[i]=0;
          }
       }
      StiffC.ExtractMyRowView(row,numEntries,values,cols);
        for (int i=0; i<numEntries; i++){
           if (edgeOnBoundary(cols[i])) {
             values[i]=0;
          }
       }
    }
   for (int row = 0; row<numEdges; row++){
       if (edgeOnBoundary(row)) {
          int rowindex = row;
          StiffC.ExtractMyRowView(row,numEntries,values,cols);
          for (int i=0; i<numEntries; i++){
             values[i]=0;
          }
          MassC.ExtractMyRowView(row,numEntries,values,cols);
          for (int i=0; i<numEntries; i++){
             values[i]=0;
          }
         rhsC[0][row]=0;
         double val = 1.0;
         StiffC.ReplaceGlobalValues(1, &rowindex, 1, &rowindex, &val);
       }
    }


#ifdef DUMP_DATA
  // Dump matrices to disk
   EpetraExt::RowMatrixToMatlabFile("mag_m0inv_matrix.dat",MassGinv);
   EpetraExt::RowMatrixToMatlabFile("mag_m1_matrix.dat",MassC);
   EpetraExt::RowMatrixToMatlabFile("mag_k1_matrix.dat",StiffC);
   EpetraExt::RowMatrixToMatlabFile("mag_t0_matrix.dat",DGrad);
   EpetraExt::MultiVectorToMatrixMarketFile("mag_rhs1_vector.dat",rhsC,0,0,false);
   fSignsout.close();
#endif


   std::cout << "End Result: TEST PASSED\n";

 // reset format state of std::cout
 std::cout.copyfmt(oldFormatState);
 
 return 0;
}
// Example #4
// 0
// Example driver: assembles the PDE Jacobian of a nonlinear reaction-diffusion
// equation on a uniform hexahedral mesh of the unit cube in two ways -- from
// hand-written integrals ("analytic") and from automatic differentiation of the
// element residual (FadType) -- and compares the two global matrices.
//
// Command line:  NX NY NZ [verbose]
//   NX, NY, NZ  number of intervals in x, y, z (must be positive)
//   verbose     any additional argument enables progress output on std::cout
//
// Returns 1 when the arguments are unusable and 0 otherwise.  The outcome of
// the matrix comparison is reported only through the
// "End Result: TEST PASSED" / "End Result: TEST FAILED" line on std::cout.
int main(int argc, char *argv[]) {
    Kokkos::initialize();

    // Check number of arguments
    if (argc < 4) {
      std::cout <<"\n>>> ERROR: Invalid number of arguments.\n\n";
      std::cout <<"Usage:\n\n";
      std::cout <<"  ./Intrepid_example_Drivers_Example_03NL.exe NX NY NZ verbose\n\n";
      std::cout <<" where \n";
      std::cout <<"   int NX              - num intervals in x direction (assumed box domain, 0,1) \n";
      std::cout <<"   int NY              - num intervals in y direction (assumed box domain, 0,1) \n";
      std::cout <<"   int NZ              - num intervals in z direction (assumed box domain, 0,1) \n";
      std::cout <<"   verbose (optional)  - any character, indicates verbose output \n\n";
      // Pair the initialize() above with finalize() on the error path as well,
      // instead of leaving through exit() with Kokkos still initialized.
      Kokkos::finalize();
      return 1;
    }

    // This little trick lets us print to std::cout only if
    // a (dummy) command-line argument is provided.
    int iprint     = argc - 1;
    Teuchos::RCP<std::ostream> outStream;
    Teuchos::oblackholestream bhs; // outputs nothing
    if (iprint > 3)
      outStream = Teuchos::rcp(&std::cout, false);
    else
      outStream = Teuchos::rcp(&bhs, false);

    // Save the format state of the original std::cout.
    Teuchos::oblackholestream oldFormatState;
    oldFormatState.copyfmt(std::cout);

    *outStream \
    << "===============================================================================\n" \
    << "|                                                                             |\n" \
    << "|  Example: Generate PDE Jacobian for a Nonlinear Reaction-Diffusion          |\n" \
    << "|                   Equation on Hexahedral Mesh                               |\n" \
    << "|                                                                             |\n" \
    << "|  Questions? Contact  Pavel Bochev  ([email protected]),                    |\n" \
    << "|                      Denis Ridzal  ([email protected]),                    |\n" \
    << "|                      Kara Peterson ([email protected]).                    |\n" \
    << "|                                                                             |\n" \
    << "|  Intrepid's website: http://trilinos.sandia.gov/packages/intrepid           |\n" \
    << "|  Trilinos website:   http://trilinos.sandia.gov                             |\n" \
    << "|                                                                             |\n" \
    << "===============================================================================\n";


    // ************************************ GET INPUTS **************************************

    int NX = atoi(argv[1]);  // num intervals in x direction (assumed box domain, 0,1)
    int NY = atoi(argv[2]);  // num intervals in y direction (assumed box domain, 0,1)
    int NZ = atoi(argv[3]);  // num intervals in z direction (assumed box domain, 0,1)

    // atoi() yields 0 for non-numeric text.  A zero count would divide by zero
    // in the mesh spacing below and a negative count would produce negative
    // container extents, so reject both up front.
    if (NX <= 0 || NY <= 0 || NZ <= 0) {
      std::cout <<"\n>>> ERROR: NX, NY and NZ must be positive integers.\n\n";
      Kokkos::finalize();
      return 1;
    }

    // *********************************** CELL TOPOLOGY **********************************

    // Get cell topology for base hexahedron
    typedef shards::CellTopology CellTopology;
    CellTopology hex_8(shards::getCellTopologyData<shards::Hexahedron<8> >() );

    // Get dimensions
    int numNodesPerElem = hex_8.getNodeCount();
    int spaceDim = hex_8.getDimension();

    // *********************************** GENERATE MESH ************************************

    *outStream << "Generating mesh ... \n\n";

    *outStream << "   NX" << "   NY" << "   NZ\n";
    *outStream << std::setw(5) << NX <<
                  std::setw(5) << NY <<
                  std::setw(5) << NZ << "\n\n";

    // Print mesh information
    int numElems = NX*NY*NZ;
    int numNodes = (NX+1)*(NY+1)*(NZ+1);
    *outStream << " Number of Elements: " << numElems << " \n";
    *outStream << "    Number of Nodes: " << numNodes << " \n\n";

    // Cube
    double leftX = 0.0, rightX = 1.0;
    double leftY = 0.0, rightY = 1.0;
    double leftZ = 0.0, rightZ = 1.0;

    // Mesh spacing
    double hx = (rightX-leftX)/((double)NX);
    double hy = (rightY-leftY)/((double)NY);
    double hz = (rightZ-leftZ)/((double)NZ);

    // Get nodal coordinates.  Nodes are numbered with i (x) running fastest,
    // then j (y), then k (z); nodeOnBoundary is 1 for nodes on any cube face.
    FieldContainer<double> nodeCoord(numNodes, spaceDim);
    FieldContainer<int> nodeOnBoundary(numNodes);
    int inode = 0;
    for (int k=0; k<NZ+1; k++) {
      for (int j=0; j<NY+1; j++) {
        for (int i=0; i<NX+1; i++) {
          nodeCoord(inode,0) = leftX + (double)i*hx;
          nodeCoord(inode,1) = leftY + (double)j*hy;
          nodeCoord(inode,2) = leftZ + (double)k*hz;
          if (k==0 || j==0 || i==0 || k==NZ || j==NY || i==NX){
             nodeOnBoundary(inode)=1;
          }
          else {
             nodeOnBoundary(inode)=0;
          }
          inode++;
        }
      }
    }

#ifdef DUMP_DATA
    // Print nodal coords
    ofstream fcoordout("coords.dat");
    for (int i=0; i<numNodes; i++) {
       fcoordout << nodeCoord(i,0) <<" ";
       fcoordout << nodeCoord(i,1) <<" ";
       fcoordout << nodeCoord(i,2) <<"\n";
    }
    fcoordout.close();
#endif


    // Element to Node map.  Local nodes 0-3 are the k-layer corners and 4-7
    // the (k+1)-layer corners, using the node numbering established above.
    FieldContainer<int> elemToNode(numElems, numNodesPerElem);
    int ielem = 0;
    for (int k=0; k<NZ; k++) {
      for (int j=0; j<NY; j++) {
        for (int i=0; i<NX; i++) {
          elemToNode(ielem,0) = (NY + 1)*(NX + 1)*k + (NX + 1)*j + i;
          elemToNode(ielem,1) = (NY + 1)*(NX + 1)*k + (NX + 1)*j + i + 1;
          elemToNode(ielem,2) = (NY + 1)*(NX + 1)*k + (NX + 1)*(j + 1) + i + 1;
          elemToNode(ielem,3) = (NY + 1)*(NX + 1)*k + (NX + 1)*(j + 1) + i;
          elemToNode(ielem,4) = (NY + 1)*(NX + 1)*(k + 1) + (NX + 1)*j + i;
          elemToNode(ielem,5) = (NY + 1)*(NX + 1)*(k + 1) + (NX + 1)*j + i + 1;
          elemToNode(ielem,6) = (NY + 1)*(NX + 1)*(k + 1) + (NX + 1)*(j + 1) + i + 1;
          elemToNode(ielem,7) = (NY + 1)*(NX + 1)*(k + 1) + (NX + 1)*(j + 1) + i;
          ielem++;
        }
      }
    }
#ifdef DUMP_DATA
    // Output connectivity
    ofstream fe2nout("elem2node.dat");
    for (int k=0; k<NZ; k++) {
      for (int j=0; j<NY; j++) {
        for (int i=0; i<NX; i++) {
          // Distinct name so the running counter `ielem` above is not shadowed.
          int elemId = i + j * NX + k * NX * NY;
          for (int m=0; m<numNodesPerElem; m++){
              fe2nout << elemToNode(elemId,m) <<"  ";
           }
          fe2nout <<"\n";
        }
      }
    }
    fe2nout.close();
#endif


    // ************************************ CUBATURE **************************************

    *outStream << "Getting cubature ... \n\n";

    // Get numerical integration points and weights
    DefaultCubatureFactory<double>  cubFactory;
    int cubDegree = 2;
    Teuchos::RCP<Cubature<double> > hexCub = cubFactory.create(hex_8, cubDegree);

    int cubDim       = hexCub->getDimension();
    int numCubPoints = hexCub->getNumPoints();

    FieldContainer<double> cubPoints(numCubPoints, cubDim);
    FieldContainer<double> cubWeights(numCubPoints);

    hexCub->getCubature(cubPoints, cubWeights);


    // ************************************** BASIS ***************************************

    *outStream << "Getting basis ... \n\n";

    // Define basis
    Basis_HGRAD_HEX_C1_FEM<double, FieldContainer<double> > hexHGradBasis;
    int numFieldsG = hexHGradBasis.getCardinality();
    FieldContainer<double> hexGVals(numFieldsG, numCubPoints);
    FieldContainer<double> hexGrads(numFieldsG, numCubPoints, spaceDim);

    // Evaluate basis values and gradients at cubature points
    hexHGradBasis.getValues(hexGVals, cubPoints, OPERATOR_VALUE);
    hexHGradBasis.getValues(hexGrads, cubPoints, OPERATOR_GRAD);


    // ******** FEM ASSEMBLY *************

    *outStream << "Building PDE Jacobian ... \n\n";

    // Settings and data structures for mass and stiffness matrices
    typedef CellTools<double>  CellTools;
    typedef FunctionSpaceTools fst;
    int numCells = BATCH_SIZE;
    int numBatches = numElems/numCells;

    // Integer division above discards a trailing partial batch.  Both assembly
    // loops skip the same elements, so the comparison stays consistent, but
    // the matrices are incomplete -- say so instead of dropping them silently.
    if (numElems % numCells != 0) {
      *outStream << "WARNING: " << (numElems % numCells) << " of " << numElems
                 << " elements do not fill a complete batch of " << numCells
                 << " cells and are not assembled.\n\n";
    }

    // Container for nodes
    FieldContainer<double> hexNodes(numCells, numNodesPerElem, spaceDim);
    // Containers for Jacobian
    FieldContainer<double> hexJacobian(numCells, numCubPoints, spaceDim, spaceDim);
    FieldContainer<double> hexJacobInv(numCells, numCubPoints, spaceDim, spaceDim);
    FieldContainer<double> hexJacobDet(numCells, numCubPoints);
    // Containers for HGRAD bases
    FieldContainer<double> localPDEjacobian(numCells, numFieldsG, numFieldsG);
    FieldContainer<double> weightedMeasure(numCells, numCubPoints);
    FieldContainer<double> hexGValsTransformed(numCells, numFieldsG, numCubPoints);
    FieldContainer<double> hexGValsTransformedWeighted(numCells, numFieldsG, numCubPoints);
    FieldContainer<double> hexGradsTransformed(numCells, numFieldsG, numCubPoints, spaceDim);
    FieldContainer<double> hexGradsTransformedWeighted(numCells, numFieldsG, numCubPoints, spaceDim);

    // Global arrays in Epetra format
    Epetra_SerialComm Comm;
    Epetra_Map globalMapG(numNodes, 0, Comm);
    Epetra_FECrsMatrix StiffMatrix(Copy, globalMapG, 64);

    // Additional arrays used in analytic assembly
    FieldContainer<double> u_coeffs(numCells, numFieldsG);
    FieldContainer<double> u_FE_val(numCells, numCubPoints);
    FieldContainer<double> df_of_u(numCells, numCubPoints);
    FieldContainer<double> df_of_u_times_basis(numCells, numFieldsG, numCubPoints);


    // Additional arrays used in AD-based assembly.
    FieldContainer<FadType> u_coeffsAD(numCells, numFieldsG);
    FieldContainer<FadType> u_FE_gradAD(numCells, numCubPoints, spaceDim);
    FieldContainer<FadType> u_FE_valAD(numCells, numCubPoints);
    FieldContainer<FadType> f_of_u_AD(numCells, numCubPoints);
    FieldContainer<FadType> cellResidualAD(numCells, numFieldsG);
    for (int c=0; c<numCells; c++) {
      for(int f=0; f<numFieldsG; f++) {
          // Every coefficient has value 1.3; coefficient f is constructed with
          // numFieldsG derivative slots and index f as its own slot.
          u_coeffsAD(c,f) = FadType(numFieldsG, f, 1.3);
          // The analytic path must linearize about the same state in EVERY
          // cell of the batch (previously only cell 0 was filled in).  The
          // state never changes, so it is copied once here instead of once
          // per batch.
          u_coeffs(c,f) = u_coeffsAD(c,f).val();
      }
    }

    Teuchos::Time timer_jac_analytic("Time to compute element PDE Jacobians analytically: ");
    Teuchos::Time timer_jac_fad     ("Time to compute element PDE Jacobians using AD:     ");
    Teuchos::Time timer_jac_insert  ("Time for global insert,  w/o graph: ");
    Teuchos::Time timer_jac_insert_g("Time for global insert,  w/  graph: ");
    Teuchos::Time timer_jac_ga      ("Time for GlobalAssemble, w/o graph: ");
    Teuchos::Time timer_jac_ga_g    ("Time for GlobalAssemble, w/  graph: ");
    Teuchos::Time timer_jac_fc      ("Time for FillComplete,   w/o graph: ");
    Teuchos::Time timer_jac_fc_g    ("Time for FillComplete,   w/  graph: ");

    // Global node ids of the cell currently being scattered.  Row and column
    // indices are the same list (elemToNode(k, .)), so one buffer serves both;
    // it is allocated once here rather than twice per cell inside the loops.
    std::vector<int> cellNodeIds(numFieldsG);


    // *** Analytic element loop ***
    for (int bi=0; bi<numBatches; bi++) {

      // Physical cell coordinates
      for (int ci=0; ci<numCells; ci++) {
        int k = bi*numCells+ci;
        for (int i=0; i<numNodesPerElem; i++) {
            hexNodes(ci,i,0) = nodeCoord(elemToNode(k,i),0);
            hexNodes(ci,i,1) = nodeCoord(elemToNode(k,i),1);
            hexNodes(ci,i,2) = nodeCoord(elemToNode(k,i),2);
        }
      }

      // Compute cell Jacobians, their inverses and their determinants
      CellTools::setJacobian(hexJacobian, cubPoints, hexNodes, hex_8);
      CellTools::setJacobianInv(hexJacobInv, hexJacobian );
      CellTools::setJacobianDet(hexJacobDet, hexJacobian );

      // ******************** COMPUTE ELEMENT HGrad STIFFNESS MATRICES WITHOUT AD *******************

      // transform to physical coordinates
      fst::HGRADtransformGRAD<double>(hexGradsTransformed, hexJacobInv, hexGrads);

      // compute weighted measure
      fst::computeCellMeasure<double>(weightedMeasure, hexJacobDet, cubWeights);

      // multiply values with weighted measure
      fst::multiplyMeasure<double>(hexGradsTransformedWeighted,
                                   weightedMeasure, hexGradsTransformed);

      // transform basis values to physical coordinates and weight them.
      // The reaction term below reads both containers; they used to be filled
      // only in the AD loop, which runs after this loop has finished, so this
      // loop was integrating against containers nothing had written yet.
      fst::HGRADtransformVALUE<double>(hexGValsTransformed, hexGVals);
      fst::multiplyMeasure<double>(hexGValsTransformedWeighted,
                                   weightedMeasure, hexGValsTransformed);

      timer_jac_analytic.start(); // START TIMER
      // integrate to account for linear stiffness term
      fst::integrate<double>(localPDEjacobian, hexGradsTransformed, hexGradsTransformedWeighted, INTREPID_INTEGRATE_COMP_ENGINE);

      // represent value of the current state (iterate) as a linear combination of the basis functions
      u_FE_val.initialize();
      fst::evaluate<double>(u_FE_val, u_coeffs, hexGValsTransformed);

      // evaluate derivative of the nonlinear term and multiply by basis function
      dfunc_u(df_of_u, u_FE_val);
      fst::scalarMultiplyDataField<double>(df_of_u_times_basis, df_of_u, hexGValsTransformed);

      // integrate to account for nonlinear reaction term
      // (trailing `true`: add to the stiffness term already in localPDEjacobian)
      fst::integrate<double>(localPDEjacobian, df_of_u_times_basis, hexGValsTransformedWeighted, INTREPID_INTEGRATE_COMP_ENGINE, true);
      timer_jac_analytic.stop(); // STOP TIMER

      // assemble into global matrix
      for (int ci=0; ci<numCells; ci++) {
        int k = bi*numCells+ci;
        for (int dof = 0; dof < numFieldsG; dof++){
          cellNodeIds[dof] = elemToNode(k,dof);
        }
        // We can insert an entire matrix at a time, but we opt for rows only.
        //timer_jac_insert.start();
        //StiffMatrix.InsertGlobalValues(numFieldsG, &cellNodeIds[0], numFieldsG, &cellNodeIds[0], &localPDEjacobian(ci,0,0));
        //timer_jac_insert.stop();
        for (int row = 0; row < numFieldsG; row++){
          timer_jac_insert.start();
          StiffMatrix.InsertGlobalValues(1, &cellNodeIds[row], numFieldsG, &cellNodeIds[0], &localPDEjacobian(ci,row,0));
          timer_jac_insert.stop();
        }
      }

    } // *** end analytic element loop ***

    // Assemble global objects
    timer_jac_ga.start(); StiffMatrix.GlobalAssemble(); timer_jac_ga.stop();
    timer_jac_fc.start(); StiffMatrix.FillComplete(); timer_jac_fc.stop();




    // *** AD element loop ***

    // The AD matrix is built on the graph of the analytic matrix, which is why
    // it is filled with SumIntoGlobalValues rather than InsertGlobalValues.
    Epetra_CrsGraph mgraph = StiffMatrix.Graph();
    Epetra_FECrsMatrix StiffMatrixViaAD(Copy, mgraph);

    for (int bi=0; bi<numBatches; bi++) {

      // ******************** COMPUTE ELEMENT HGrad STIFFNESS MATRICES AND RIGHT-HAND SIDE WITH AD ********************

      // Physical cell coordinates
      for (int ci=0; ci<numCells; ci++) {
        int k = bi*numCells+ci;
        for (int i=0; i<numNodesPerElem; i++) {
            hexNodes(ci,i,0) = nodeCoord(elemToNode(k,i),0);
            hexNodes(ci,i,1) = nodeCoord(elemToNode(k,i),1);
            hexNodes(ci,i,2) = nodeCoord(elemToNode(k,i),2);
        }
      }

      // Compute cell Jacobians, their inverses and their determinants
      CellTools::setJacobian(hexJacobian, cubPoints, hexNodes, hex_8);
      CellTools::setJacobianInv(hexJacobInv, hexJacobian );
      CellTools::setJacobianDet(hexJacobDet, hexJacobian );

      // transform to physical coordinates
      fst::HGRADtransformGRAD<double>(hexGradsTransformed, hexJacobInv, hexGrads);

      // compute weighted measure
      fst::computeCellMeasure<double>(weightedMeasure, hexJacobDet, cubWeights);

      // multiply values with weighted measure
      fst::multiplyMeasure<double>(hexGradsTransformedWeighted, weightedMeasure, hexGradsTransformed);

      // transform basis values to physical coordinates
      fst::HGRADtransformVALUE<double>(hexGValsTransformed, hexGVals);

      // multiply values with weighted measure
      fst::multiplyMeasure<double>(hexGValsTransformedWeighted,
                                   weightedMeasure, hexGValsTransformed);

      timer_jac_fad.start(); // START TIMER
      // represent gradient of the current state (iterate) as a linear combination of the gradients of basis functions
      // use AD arrays !
      u_FE_gradAD.initialize();
      fst::evaluate<FadType>(u_FE_gradAD, u_coeffsAD, hexGradsTransformed);

      // represent value of the current state (iterate) as a linear combination of the basis functions
      // use AD arrays !
      u_FE_valAD.initialize();
      fst::evaluate<FadType>(u_FE_valAD, u_coeffsAD, hexGValsTransformed);
      // compute nonlinear term
      func_u(f_of_u_AD, u_FE_valAD);

      // integrate to compute element residual
      // (second call accumulates the reaction part onto the diffusion part)
      fst::integrate<FadType>(cellResidualAD, u_FE_gradAD,  hexGradsTransformedWeighted, INTREPID_INTEGRATE_COMP_ENGINE);
      fst::integrate<FadType>(cellResidualAD, f_of_u_AD, hexGValsTransformedWeighted, INTREPID_INTEGRATE_COMP_ENGINE, true);
      timer_jac_fad.stop(); // STOP TIMER

      // assemble into global matrix
      for (int ci=0; ci<numCells; ci++) {
        int k = bi*numCells+ci;
        for (int dof = 0; dof < numFieldsG; dof++){
          cellNodeIds[dof] = elemToNode(k,dof);
        }
        for (int row = 0; row < numFieldsG; row++){
          timer_jac_insert_g.start();
          // dx() supplies the derivative array of residual entry `row`, which
          // is scattered as that row of the element Jacobian.
          StiffMatrixViaAD.SumIntoGlobalValues(1, &cellNodeIds[row], numFieldsG, &cellNodeIds[0], cellResidualAD(ci,row).dx());
          timer_jac_insert_g.stop();
        }
      }

    } // *** end AD element loop ***

    // Assemble global objects
    timer_jac_ga_g.start(); StiffMatrixViaAD.GlobalAssemble(); timer_jac_ga_g.stop();
    timer_jac_fc_g.start(); StiffMatrixViaAD.FillComplete(); timer_jac_fc_g.stop();



    /****** Output *******/

#ifdef DUMP_DATA
    // Dump matrices to disk
    EpetraExt::RowMatrixToMatlabFile("stiff_matrix.dat",StiffMatrix);
    EpetraExt::RowMatrixToMatlabFile("stiff_matrixAD.dat",StiffMatrixViaAD);
#endif

    // take the infinity norm of the difference between StiffMatrix and StiffMatrixViaAD to see that
    // the two matrices are the same.  After the Add call StiffMatrixViaAD no longer
    // holds the AD Jacobian but the difference (StiffMatrix - StiffMatrixViaAD).
    EpetraExt::MatrixMatrix::Add(StiffMatrix, false, 1.0, StiffMatrixViaAD, -1.0);
    double normMat = StiffMatrixViaAD.NormInf();
    *outStream << "Infinity norm of difference between stiffness matrices = " << normMat << "\n";


    *outStream << "\n\nNumber of global nonzeros: " << StiffMatrix.NumGlobalNonzeros() << "\n\n";

    *outStream << timer_jac_analytic.name() << " " << timer_jac_analytic.totalElapsedTime() << " sec\n";
    *outStream << timer_jac_fad.name()      << " " << timer_jac_fad.totalElapsedTime()      << " sec\n\n";
    *outStream << timer_jac_insert.name()   << " " << timer_jac_insert.totalElapsedTime()   << " sec\n";
    *outStream << timer_jac_insert_g.name() << " " << timer_jac_insert_g.totalElapsedTime() << " sec\n\n";
    *outStream << timer_jac_ga.name()       << " " << timer_jac_ga.totalElapsedTime()       << " sec\n";
    *outStream << timer_jac_ga_g.name()     << " " << timer_jac_ga_g.totalElapsedTime()     << " sec\n\n";
    *outStream << timer_jac_fc.name()       << " " << timer_jac_fc.totalElapsedTime()       << " sec\n";
    *outStream << timer_jac_fc_g.name()     << " " << timer_jac_fc_g.totalElapsedTime()     << " sec\n\n";

    // The verdict line always goes to std::cout, independent of `verbose`.
    if ((normMat < 1.0e4*INTREPID_TOL)) {
      std::cout << "End Result: TEST PASSED\n";
    }
    else {
      std::cout << "End Result: TEST FAILED\n";
    }

    // reset format state of std::cout
    std::cout.copyfmt(oldFormatState);
    Kokkos::finalize();
    return 0;
}