// Hessian specialization of the Epetra residual scatter.
// For each cell in the workset and each scattered field, the nested derivative
// component fastAccessDx(i).fastAccessDx(0) of every basis value is copied into
// a row buffer, which is then summed into the Epetra matrix returned by get_A().
void panzer::ScatterResidual_Epetra<panzer::Traits::Hessian, TRAITS,LO,GO>::
evaluateFields(typename TRAITS::EvalData workset)
   // column and row local IDs of the cell currently being processed
   std::vector<int> cLIDs, rLIDs;
   // staging buffer holding one matrix row of derivative values
   // NOTE(review): no resize of jacRow is visible here before it is indexed below -- confirm
   std::vector<double> jacRow;

   // a dedicated column indexer is used only when one has been provided
   bool useColumnIndexer = colGlobalIndexer_!=Teuchos::null;

   // for convenience pull out some objects from workset
   std::string blockId = this->wda(workset).block_id;
   const std::vector<std::size_t> & localCellIds = this->wda(workset).cell_local_ids;

   // linear objects to scatter into (r is not referenced in the visible part of this function)
   Teuchos::RCP<Epetra_Vector> r = epetraContainer_->get_f(); 
   Teuchos::RCP<Epetra_CrsMatrix> Jac = epetraContainer_->get_A();

   // fall back to the row indexer for columns when no column indexer is set
   const Teuchos::RCP<const panzer::UniqueGlobalIndexer<LO,GO> >&
     colGlobalIndexer = useColumnIndexer ? colGlobalIndexer_ : globalIndexer_;
   // NOTE: A reordering of these loops will likely improve performance
   //       The "getGIDFieldOffsets" may be expensive.  However the
   //       "getElementGIDs" can be cheaper. However the lookup for LIDs
   //       may be more expensive!

   // scatter operation for each cell in workset
   for(std::size_t worksetCellIndex=0;worksetCellIndex<localCellIds.size();++worksetCellIndex) {
      std::size_t cellLocalId = localCellIds[worksetCellIndex];

      // row IDs come from the row indexer, column IDs from the (possibly distinct) column indexer
      rLIDs = globalIndexer_->getElementLIDs(cellLocalId); 
      cLIDs = colGlobalIndexer->getElementLIDs(cellLocalId);
      // when the workset carries an "other" workset, append that cell's column
      // LIDs after this cell's column LIDs
      if (Teuchos::nonnull(workset.other)) {
        const std::size_t other_cellLocalId = workset.other->cell_local_ids[worksetCellIndex];
        const std::vector<int> other_cLIDs = colGlobalIndexer->getElementLIDs(other_cellLocalId);
        cLIDs.insert(cLIDs.end(), other_cLIDs.begin(), other_cLIDs.end());

      // loop over each field to be scattered
      for(std::size_t fieldIndex = 0; fieldIndex < scatterFields_.size(); fieldIndex++) {
         int fieldNum = fieldIds_[fieldIndex];
         // offsets of this field's basis functions within the element's ID list
         const std::vector<int> & elmtOffset = globalIndexer_->getGIDFieldOffsets(blockId,fieldNum);

         // loop over the basis functions (currently they are nodes)
         for(std::size_t rowBasisNum = 0; rowBasisNum < elmtOffset.size(); rowBasisNum++) {
            const ScalarT scatterField = (scatterFields_[fieldIndex])(worksetCellIndex,rowBasisNum);
            int rowOffset = elmtOffset[rowBasisNum];
            // local matrix row that receives this basis function's contribution
            int row = rLIDs[rowOffset];
            // loop over the sensitivity indices: all DOFs on a cell
            // (takes the first derivative component of each outer derivative entry)
            for(int sensIndex=0;sensIndex<scatterField.size();++sensIndex)
              jacRow[sensIndex] = scatterField.fastAccessDx(sensIndex).fastAccessDx(0);

               // sum the row into the matrix; the entry count is capped at the smaller
               // of the column-LID count and the number of derivative components
               int err = Jac->SumIntoMyValues(
                 std::min(cLIDs.size(), static_cast<size_t>(scatterField.size())),
         } // end rowBasisNum
      } // end fieldIndex
// Jacobian specialization of the Tpetra Dirichlet scatter.
// For each cell, field and basis function on the side closure identified by
// (side_subcell_dim_, local_side_id_), the field value is written into the
// residual, the row is flagged in dirichletCounter_, and the matrix row is
// replaced with the field's derivative components.
void panzer::ScatterDirichletResidual_Tpetra<panzer::Traits::Jacobian, TRAITS,LO,GO,NodeT>::
evaluateFields(typename TRAITS::EvalData workset)
   // global IDs of the current element
   // NOTE(review): nothing visible here fills GIDs before it is indexed below -- confirm
   std::vector<GO> GIDs;
   // for convenience pull out some objects from workset
   std::string blockId = workset.block_id;
   const std::vector<std::size_t> & localCellIds = workset.cell_local_ids;

   // residual vector and Jacobian matrix to scatter into
   Teuchos::RCP<typename LOC::VectorType> r = tpetraContainer_->get_f(); 
   Teuchos::RCP<typename LOC::CrsMatrixType> Jac = tpetraContainer_->get_A();

   // writable views of the residual and of the Dirichlet marker vector
   Teuchos::ArrayRCP<double> r_array = r->get1dViewNonConst();
   Teuchos::ArrayRCP<double> dc_array = dirichletCounter_->get1dViewNonConst();

   // NOTE: A reordering of these loops will likely improve performance
   //       The "getGIDFieldOffsets" may be expensive.  However the
   //       "getElementGIDs" can be cheaper. However the lookup for LIDs
   //       may be more expensive!

   // scatter operation for each cell in workset
   for(std::size_t worksetCellIndex=0;worksetCellIndex<localCellIds.size();++worksetCellIndex) {
      std::size_t cellLocalId = localCellIds[worksetCellIndex];

      // local IDs of every degree of freedom on this element
      const std::vector<LO> & LIDs = globalIndexer_->getElementLIDs(cellLocalId); 

      // loop over each field to be scattered
      for(std::size_t fieldIndex = 0; fieldIndex < scatterFields_.size(); fieldIndex++) {
         int fieldNum = fieldIds_[fieldIndex];
         // this call "should" get the right ordering according to the Intrepid basis
         const std::pair<std::vector<int>,std::vector<int> > & indicePair 
               = globalIndexer_->getGIDFieldOffsets_closure(blockId,fieldNum, side_subcell_dim_, local_side_id_);
         // first: offsets into the element ID list; second: matching basis indices
         const std::vector<int> & elmtOffset = indicePair.first;
         const std::vector<int> & basisIdMap = indicePair.second;
         // loop over basis functions
         for(std::size_t basis=0;basis<elmtOffset.size();basis++) {
            int offset = elmtOffset[basis];
            LO lid = LIDs[offset];
            // NOTE(review): no body is visible for this guard; presumably the basis
            // function is skipped when lid is negative -- confirm
            if(lid<0) // not on this processor

            int basisId = basisIdMap[basis];

            // NOTE(review): no body is visible for the inner test; presumably entries
            // whose applyBC_ flag is false are skipped -- confirm
            if (checkApplyBC_)
              if (!applyBC_[fieldIndex](worksetCellIndex,basisId))

            // zero out matrix row
               std::size_t sz = Jac->getNumEntriesInLocalRow(lid);
               std::size_t numEntries = 0;
               Teuchos::Array<LO> rowIndices(sz);
               Teuchos::Array<double> rowValues(sz);

               // Jac->getLocalRowView(lid,numEntries,rowValues,rowIndices);

               // with the row-view call above commented out, numEntries stays 0,
               // so as written this loop performs no iterations
               for(std::size_t i=0;i<numEntries;i++)
                  rowValues[i] = 0.0;

            GO gid = GIDs[offset];
            const ScalarT scatterField = (scatterFields_[fieldIndex])(worksetCellIndex,basisId);
            // overwrite (not accumulate) the residual entry with the field value
            r_array[lid] = scatterField.val();
            dc_array[lid] = 1.0; // mark row as dirichlet
            // loop over the sensitivity indices: all DOFs on a cell
            std::vector<double> jacRow(scatterField.size(),0.0);
            for(int sensIndex=0;sensIndex<scatterField.size();++sensIndex)
               jacRow[sensIndex] = scatterField.fastAccessDx(sensIndex);
            // replace the row's entries at the element's global columns with the derivatives
            Jac->replaceGlobalValues(gid, GIDs, jacRow);
// Jacobian specialization of the blocked Epetra residual scatter.
// For each cell, field and basis function, the field value is added to the
// residual block of the field's block row, and the field's derivative
// components are summed, one block column at a time, into the matching
// Epetra_CrsMatrix sub-block of the blocked Jacobian.
void panzer::ScatterResidual_BlockedEpetra<panzer::Traits::Jacobian, TRAITS,LO,GO>::
evaluateFields(typename TRAITS::EvalData workset)
   using Teuchos::RCP;
   using Teuchos::ArrayRCP;
   using Teuchos::ptrFromRef;
   using Teuchos::rcp_dynamic_cast;

   using Thyra::VectorBase;
   using Thyra::SpmdVectorBase;
   using Thyra::ProductVectorBase;
   using Thyra::BlockedLinearOpBase;

   typedef BlockedEpetraLinearObjContainer BLOC;

   // (block index, global ID) pairs and the matching local IDs for the current element
   // NOTE(review): nothing visible here fills GIDs or sizes LIDs/jacRow -- confirm
   std::vector<std::pair<int,GO> > GIDs;
   std::vector<LO> LIDs;
   std::vector<double> jacRow;

   // for convenience pull out some objects from workset
   std::string blockId = this->wda(workset).block_id;
   const std::vector<std::size_t> & localCellIds = this->wda(workset).cell_local_ids;

   RCP<const BLOC> blockedContainer = blockedContainer_;

   // view the container's residual and Jacobian through their blocked Thyra interfaces
   RCP<ProductVectorBase<double> > r = rcp_dynamic_cast<ProductVectorBase<double> >(blockedContainer->get_f());
   Teuchos::RCP<BlockedLinearOpBase<double> > Jac = rcp_dynamic_cast<BlockedLinearOpBase<double> >(blockedContainer->get_A());

   // blockOffsets[b] is the start of block b inside the element's ID list;
   // the extra last slot is filled in later as the end sentinel
   int numFieldBlocks = globalIndexer_->getNumFieldBlocks();
   std::vector<int> blockOffsets(numFieldBlocks+1); // number of fields, plus a sentinel
   for(int blk=0;blk<numFieldBlocks;blk++) {
      int blockOffset = globalIndexer_->getBlockGIDOffset(blockId,blk);
      blockOffsets[blk] = blockOffset;

   // cache of already-extracted sub-block matrices, keyed by (block row, block column)
   std::unordered_map<std::pair<int,int>,Teuchos::RCP<Epetra_CrsMatrix>,panzer::pair_hash> jacEpetraBlocks;

   // NOTE: A reordering of these loops will likely improve performance
   //       The "getGIDFieldOffsets" may be expensive.  However the
   //       "getElementGIDs" can be cheaper. However the lookup for LIDs
   //       may be more expensive!

   // scatter operation for each cell in workset
   for(std::size_t worksetCellIndex=0;worksetCellIndex<localCellIds.size();++worksetCellIndex) {
      std::size_t cellLocalId = localCellIds[worksetCellIndex];


      // calculate the local IDs for this element
      for(std::size_t i=0;i<GIDs.size();i++) {
         // used for doing local ID lookups
         RCP<const Epetra_Map> r_map = blockedContainer->getMapForBlock(GIDs[i].first);

         // translate the global ID through the map of the block it belongs to
         LIDs[i] = r_map->LID(GIDs[i].second);

      // loop over each field to be scattered
      Teuchos::ArrayRCP<double> local_r;
      for(std::size_t fieldIndex = 0; fieldIndex < scatterFields_.size(); fieldIndex++) {
         int fieldNum = fieldIds_[fieldIndex];
         // block row that owns this field
         int blockRowIndex = globalIndexer_->getFieldBlock(fieldNum);

         // grab local data for inputting
         if(r!=Teuchos::null) {
            RCP<SpmdVectorBase<double> > block_r = rcp_dynamic_cast<SpmdVectorBase<double> >(r->getNonconstVectorBlock(blockRowIndex));

         const std::vector<int> & elmtOffset = globalIndexer_->getGIDFieldOffsets(blockId,fieldNum);
         // loop over the basis functions (currently they are nodes)
         for(std::size_t rowBasisNum = 0; rowBasisNum < elmtOffset.size(); rowBasisNum++) {
            const ScalarT scatterField = (scatterFields_[fieldIndex])(worksetCellIndex,rowBasisNum);
            int rowOffset = elmtOffset[rowBasisNum];
            int r_lid = LIDs[rowOffset];
            // Sum residual
               local_r[r_lid] += (scatterField.val());

            // the derivative count closes the last block's [start,end) range
            blockOffsets[numFieldBlocks] = scatterField.size(); // add the sentinel
            // loop over the sensitivity indices: all DOFs on a cell
            // For Neumann conditions with no dependence on degrees of freedom, there should be no Jacobian contribution
            // NOTE(review): no body is visible for this test; presumably the Jacobian
            // part is skipped when there are no derivative components -- confirm
            if(scatterField.size() == 0)
            for(int sensIndex=0;sensIndex<scatterField.size();++sensIndex) {
               jacRow[sensIndex] = scatterField.fastAccessDx(sensIndex);
            // scatter the row one block column at a time
            for(int blockColIndex=0;blockColIndex<numFieldBlocks;blockColIndex++) {
               // [start,end) is this block column's slice of jacRow and LIDs
               int start = blockOffsets[blockColIndex];
               int end = blockOffsets[blockColIndex+1];


               // check hash table for jacobian sub block
               // (operator[] default-inserts a null RCP when the key is absent)
               std::pair<int,int> blockIndex = std::make_pair(blockRowIndex,blockColIndex);
               Teuchos::RCP<Epetra_CrsMatrix> subJac = jacEpetraBlocks[blockIndex];

               // if you didn't find one before, add it to the hash table
               if(subJac==Teuchos::null) {
                  Teuchos::RCP<Thyra::LinearOpBase<double> > tOp = Jac->getNonconstBlock(blockIndex.first,blockIndex.second); 

                  // block operator is null, don't do anything (it is excluded)

                  // unwrap the Thyra operator down to the concrete Epetra_CrsMatrix
                  // (the 'true' argument makes a failed cast throw)
                  Teuchos::RCP<Epetra_Operator> eOp = Thyra::get_Epetra_Operator(*tOp);
                  subJac = rcp_dynamic_cast<Epetra_CrsMatrix>(eOp,true);
                  jacEpetraBlocks[blockIndex] = subJac;

               // Sum Jacobian
               int err = subJac->SumIntoMyValues(r_lid, end-start, &jacRow[start],&LIDs[start]);
               // on failure, assemble a diagnostic listing the row, the columns
               // attempted, the block indices, and whether the column map matches
               if(err!=0) {
                 RCP<const Epetra_Map> rr = blockedContainer->getMapForBlock(GIDs[start].first);
                 bool sameColMap = subJac->ColMap().SameAs(*rr);

                 std::stringstream ss;
                 ss << "Failed inserting row: " << GIDs[rowOffset].second << " (" << r_lid << "): ";
                 for(int i=start;i<end;i++)
                   ss << GIDs[i].second << " (" << LIDs[i] << ") ";
                 ss << std::endl;
                 ss << "Into block " << blockRowIndex << ", " << blockColIndex << std::endl;

                 ss << "scatter field = ";
                 ss << std::endl;

                 ss << "Same map = " << (sameColMap ? "true" : "false") << std::endl; 
         } // end rowBasisNum
      } // end fieldIndex
// Jacobian specialization of the blocked Tpetra Dirichlet scatter.
// For each cell, field and basis function on the side closure identified by
// (side_subcell_dim_, local_side_id_), the field value is written into the
// residual block, the row is flagged in the Dirichlet counter block, and the
// row's entries in each block column are replaced with the field's derivatives.
void panzer::ScatterDirichletResidual_BlockedTpetra<panzer::Traits::Jacobian, TRAITS,LO,GO,NodeT>::
evaluateFields(typename TRAITS::EvalData workset)
    using Teuchos::RCP;
    using Teuchos::ArrayRCP;
    using Teuchos::ptrFromRef;
    using Teuchos::rcp_dynamic_cast;

    using Thyra::VectorBase;
    using Thyra::SpmdVectorBase;
    using Thyra::ProductVectorBase;
    using Thyra::BlockedLinearOpBase;

    // (block index, global ID) pairs and the matching local IDs for the current element
    // NOTE(review): nothing visible here fills GIDs or sizes LIDs -- confirm
    std::vector<std::pair<int,GO> > GIDs;
    std::vector<LO> LIDs;

    // for convenience pull out some objects from workset
    std::string blockId = this->wda(workset).block_id;
    const std::vector<std::size_t> & localCellIds = this->wda(workset).cell_local_ids;

    // view the container's residual and Jacobian through their blocked Thyra interfaces
    RCP<ProductVectorBase<double> > r = rcp_dynamic_cast<ProductVectorBase<double> >(blockedContainer_->get_f());
    Teuchos::RCP<BlockedLinearOpBase<double> > Jac = rcp_dynamic_cast<BlockedLinearOpBase<double> >(blockedContainer_->get_A());

    // blockOffsets[b] is the start of block b inside the element's ID list;
    // the extra last slot is the end sentinel, set per cell below
    int numFieldBlocks = globalIndexer_->getNumFieldBlocks();
    std::vector<int> blockOffsets(numFieldBlocks+1); // number of fields, plus a sentinel
    for(int blk=0; blk<numFieldBlocks; blk++) {
        int blockOffset = globalIndexer_->getBlockGIDOffset(blockId,blk);
        blockOffsets[blk] = blockOffset;

    // cache of already-extracted sub-block matrices, keyed by (block row, block column)
    std::unordered_map<std::pair<int,int>,Teuchos::RCP<CrsMatrixType>,panzer::pair_hash> jacTpetraBlocks;

    // NOTE: A reordering of these loops will likely improve performance
    //       The "getGIDFieldOffsets" may be expensive.  However the
    //       "getElementGIDs" can be cheaper. However the lookup for LIDs
    //       may be more expensive!

    // scatter operation for each cell in workset
    for(std::size_t worksetCellIndex=0; worksetCellIndex<localCellIds.size(); ++worksetCellIndex) {
        std::size_t cellLocalId = localCellIds[worksetCellIndex];

        // the element's ID count closes the last block's [start,end) range
        blockOffsets[numFieldBlocks] = GIDs.size();

        // calculate the local IDs for this element
        for(std::size_t i=0; i<GIDs.size(); i++) {
            // used for doing local ID lookups
            RCP<const MapType> r_map = blockedContainer_->getMapForBlock(GIDs[i].first);

            // translate the global ID through the map of the block it belongs to
            LIDs[i] = r_map->getLocalElement(GIDs[i].second);

        // loop over each field to be scattered
        Teuchos::ArrayRCP<double> local_r, local_dc;
        for(std::size_t fieldIndex = 0; fieldIndex < scatterFields_.size(); fieldIndex++) {
            int fieldNum = fieldIds_[fieldIndex];
            // block row that owns this field
            int blockRowIndex = globalIndexer_->getFieldBlock(fieldNum);

            // Dirichlet marker vector block for this block row
            RCP<SpmdVectorBase<double> > dc = rcp_dynamic_cast<SpmdVectorBase<double> >(dirichletCounter_->getNonconstVectorBlock(blockRowIndex));

            // grab local data for inputting
            RCP<SpmdVectorBase<double> > block_r = rcp_dynamic_cast<SpmdVectorBase<double> >(r->getNonconstVectorBlock(blockRowIndex));

            // this call "should" get the right ordering according to the Intrepid basis
            const std::pair<std::vector<int>,std::vector<int> > & indicePair
                = globalIndexer_->getGIDFieldOffsets_closure(blockId,fieldNum, side_subcell_dim_, local_side_id_);
            // first: offsets into the element ID list; second: matching basis indices
            const std::vector<int> & elmtOffset = indicePair.first;
            const std::vector<int> & basisIdMap = indicePair.second;

            // loop over basis functions
            for(std::size_t basis=0; basis<elmtOffset.size(); basis++) {
                int offset = elmtOffset[basis];
                int lid = LIDs[offset];
                // NOTE(review): no body is visible for this guard; presumably the basis
                // function is skipped when lid is negative -- confirm
                if(lid<0) // not on this processor

                int basisId = basisIdMap[basis];

                // NOTE(review): no body is visible for the inner test; presumably entries
                // whose applyBC_ flag is false are skipped -- confirm
                if (checkApplyBC_)
                    if (!applyBC_[fieldIndex](worksetCellIndex,basisId))

                // zero out matrix row
                for(int blockColIndex=0; blockColIndex<numFieldBlocks; blockColIndex++) {
                    int start = blockOffsets[blockColIndex];
                    int end = blockOffsets[blockColIndex+1];


                    // check hash table for jacobian sub block
                    // (operator[] default-inserts a null RCP when the key is absent)
                    std::pair<int,int> blockIndex = std::make_pair(blockRowIndex,blockColIndex);
                    Teuchos::RCP<CrsMatrixType> subJac = jacTpetraBlocks[blockIndex];

                    // if you didn't find one before, add it to the hash table
                    if(subJac==Teuchos::null) {
                        Teuchos::RCP<Thyra::LinearOpBase<double> > tOp = Jac->getNonconstBlock(blockIndex.first,blockIndex.second);

                        // block operator is null, don't do anything (it is excluded)

                        // unwrap the Thyra operator down to the concrete Tpetra matrix
                        // (the 'true' argument makes the second cast throw on failure)
                        Teuchos::RCP<OperatorType> tpetra_Op = rcp_dynamic_cast<ThyraLinearOp>(tOp)->getTpetraOperator();
                        subJac = rcp_dynamic_cast<CrsMatrixType>(tpetra_Op,true);
                        jacTpetraBlocks[blockIndex] = subJac;

                    std::size_t sz = subJac->getNumEntriesInLocalRow(lid);
                    std::size_t numEntries = 0;
                    Teuchos::Array<LO> rowIndices(sz);
                    Teuchos::Array<double> rowValues(sz);


                    // numEntries is never updated in the visible code, so as written
                    // this loop performs no iterations
                    for(std::size_t i=0; i<numEntries; i++)
                        rowValues[i] = 0.0;


                const ScalarT scatterField = (scatterFields_[fieldIndex])(worksetCellIndex,basisId);

                // overwrite (not accumulate) the residual entry with the field value
                // NOTE(review): no assignment to local_r/local_dc is visible here -- confirm
                local_r[lid] = scatterField.val();
                local_dc[lid] = 1.0; // mark row as dirichlet

                // loop over the sensitivity indices: all DOFs on a cell
                std::vector<double> jacRow(scatterField.size(),0.0);

                for(int sensIndex=0; sensIndex<scatterField.size(); ++sensIndex)
                    jacRow[sensIndex] = scatterField.fastAccessDx(sensIndex);

                // write the derivative row one block column at a time
                for(int blockColIndex=0; blockColIndex<numFieldBlocks; blockColIndex++) {
                    // [start,end) is this block column's slice of the element's LIDs
                    int start = blockOffsets[blockColIndex];
                    int end = blockOffsets[blockColIndex+1];


                    // check hash table for jacobian sub block
                    std::pair<int,int> blockIndex = std::make_pair(blockRowIndex,blockColIndex);
                    Teuchos::RCP<CrsMatrixType> subJac = jacTpetraBlocks[blockIndex];

                    // if you didn't find one before, add it to the hash table
                    if(subJac==Teuchos::null) {
                        Teuchos::RCP<Thyra::LinearOpBase<double> > tOp = Jac->getNonconstBlock(blockIndex.first,blockIndex.second);

                        // block operator is null, don't do anything (it is excluded)

                        Teuchos::RCP<OperatorType> tpetra_Op = rcp_dynamic_cast<ThyraLinearOp>(tOp)->getTpetraOperator();
                        subJac = rcp_dynamic_cast<CrsMatrixType>(tpetra_Op,true);
                        jacTpetraBlocks[blockIndex] = subJac;

                    // Replace (rather than sum into) the row's entries for this block column
                    subJac->replaceLocalValues(lid, Teuchos::arrayViewFromVector(LIDs).view(start,end-start),
// Tangent specialization of the Tpetra Dirichlet scatter.
// For each cell, field and basis function, the field's derivative components
// are written into the dfdp_vectors_ entries for the row's local ID and the row
// is flagged in dirichletCounter_. When scatterIC_ is false only the side
// closure identified by (side_subcell_dim_, local_side_id_) is visited;
// otherwise every basis function of the field is visited.
void panzer::ScatterDirichletResidual_Tpetra<panzer::Traits::Tangent, TRAITS,LO,GO,NodeT>::
evaluateFields(typename TRAITS::EvalData workset)
   // global and local IDs of the current element
   // NOTE(review): nothing visible here fills GIDs or sizes LIDs -- confirm
   std::vector<GO> GIDs;
   std::vector<LO> LIDs;

   // for convenience pull out some objects from workset
   std::string blockId = this->wda(workset).block_id;
   const std::vector<std::size_t> & localCellIds = this->wda(workset).cell_local_ids;

   // target vector depends on scatterIC_: get_f() when it is false
   // NOTE(review): the alternative branch of this conditional is not visible -- confirm
   Teuchos::RCP<typename LOC::VectorType> r = (!scatterIC_) ?
     tpetraContainer_->get_f() :

   // writable views of the target vector and of the Dirichlet marker vector
   // (r_array is only referenced from commented-out lines below)
   Teuchos::ArrayRCP<double> r_array = r->get1dViewNonConst();
   Teuchos::ArrayRCP<double> dc_array = dirichletCounter_->get1dViewNonConst();

   // NOTE: A reordering of these loops will likely improve performance
   //       The "getGIDFieldOffsets" may be expensive.  However the
   //       "getElementGIDs" can be cheaper. However the lookup for LIDs
   //       may be more expensive!

   // scatter operation for each cell in workset
   for(std::size_t worksetCellIndex=0;worksetCellIndex<localCellIds.size();++worksetCellIndex) {
      std::size_t cellLocalId = localCellIds[worksetCellIndex];


      // calculate the local IDs for this element
      for(std::size_t i=0;i<GIDs.size();i++)
         LIDs[i] = r->getMap()->getLocalElement(GIDs[i]);

      // loop over each field to be scattered
      for(std::size_t fieldIndex = 0; fieldIndex < scatterFields_.size(); fieldIndex++) {
         int fieldNum = fieldIds_[fieldIndex];

         // boundary-condition path: visit only the side closure
         if (!scatterIC_) {
           // this call "should" get the right ordering according to the Intrepid2 basis
           const std::pair<std::vector<int>,std::vector<int> > & indicePair
             = globalIndexer_->getGIDFieldOffsets_closure(blockId,fieldNum, side_subcell_dim_, local_side_id_);
           // first: offsets into the element ID list; second: matching basis indices
           const std::vector<int> & elmtOffset = indicePair.first;
           const std::vector<int> & basisIdMap = indicePair.second;

           // loop over basis functions
           for(std::size_t basis=0;basis<elmtOffset.size();basis++) {
             int offset = elmtOffset[basis];
             LO lid = LIDs[offset];
             // NOTE(review): no body is visible for this guard; presumably the basis
             // function is skipped when lid is negative -- confirm
             if(lid<0) // not on this processor!

             int basisId = basisIdMap[basis];

             // NOTE(review): no body is visible for the inner test; presumably entries
             // whose applyBC_ flag is false are skipped -- confirm
             if (checkApplyBC_)
               if (!applyBC_[fieldIndex](worksetCellIndex,basisId))

             ScalarT value = (scatterFields_[fieldIndex])(worksetCellIndex,basisId);
             //r_array[lid] = (scatterFields_[fieldIndex])(worksetCellIndex,basisId).val();

             // then scatter the sensitivity vectors
             // (clear this row in every vector first, then store the derivatives value carries)
               for(std::size_t d=0;d<dfdp_vectors_.size();d++)
                 dfdp_vectors_[d][lid] = 0.0;
               for(int d=0;d<value.size();d++) {
                 dfdp_vectors_[d][lid] = value.fastAccessDx(d);

             // record that you set a dirichlet condition
             dc_array[lid] = 1.0;
         } else {
           // scatterIC_ path: visit every basis function of the field,
           // indexing the field directly by the loop counter
           // this call "should" get the right ordering according to the Intrepid2 basis
           const std::vector<int> & elmtOffset = globalIndexer_->getGIDFieldOffsets(blockId,fieldNum);

           // loop over basis functions
           for(std::size_t basis=0;basis<elmtOffset.size();basis++) {
             int offset = elmtOffset[basis];
             LO lid = LIDs[offset];
             // NOTE(review): no body is visible for this guard; presumably the basis
             // function is skipped when lid is negative -- confirm
             if(lid<0) // not on this processor!

             ScalarT value = (scatterFields_[fieldIndex])(worksetCellIndex,basis);
             //r_array[lid] = (scatterFields_[fieldIndex])(worksetCellIndex,basis).val();

             // then scatter the sensitivity vectors
             // (clear this row in every vector first, then store the derivatives value carries)
               for(std::size_t d=0;d<dfdp_vectors_.size();d++)
                 dfdp_vectors_[d][lid] = 0.0;
               for(int d=0;d<value.size();d++) {
                 dfdp_vectors_[d][lid] = value.fastAccessDx(d);

             // record that you set a dirichlet condition
             dc_array[lid] = 1.0;