//--------------------------------------------------------------------------
//-------- execute ---------------------------------------------------------
//--------------------------------------------------------------------------
void
AssembleMomentumEdgeSolverAlgorithm::execute()
{
  // Edge-based assembly of the momentum advection/diffusion contributions.
  //
  // For every edge selected by (locally owned) & union(partVec_) & !(inactive)
  // a local (2*nDim) x (2*nDim) lhs and a (2*nDim) rhs are built from the edge
  // area vector, the edge mass flow rate and the nodal fields of the two end
  // nodes (L = edge node 0, R = edge node 1) and then handed to apply_coeff().
  // Local dof ordering is [L_0..L_{nDim-1}, R_0..R_{nDim-1}]; lhs is stored
  // row-major with a row stride of nodesPerEdge*nDim.

  stk::mesh::MetaData & meta_data = realm_.meta_data();

  const int nDim = meta_data.spatial_dimension();

  // guards the division in the Peclet number; also passed to the limiter
  const double small = 1.0e-16;

  // extract user advection options (allow to potentially change over time)
  const std::string dofName = "velocity";
  const double alpha = realm_.get_alpha_factor(dofName);
  const double alphaUpw = realm_.get_alpha_upw_factor(dofName);
  const double hoUpwind = realm_.get_upw_factor(dofName);
  const bool useLimiter = realm_.primitive_uses_limiter(dofName);

  // one minus flavor
  const double om_alpha = 1.0-alpha;
  const double om_alphaUpw = 1.0-alphaUpw;

  // space for LHS/RHS; always edge connectivity
  const int nodesPerEdge = 2;
  const int lhsSize = nDim*nodesPerEdge*nDim*nodesPerEdge;
  const int rhsSize = nDim*nodesPerEdge;
  std::vector<double> lhs(lhsSize);
  std::vector<double> rhs(rhsSize);
  std::vector<stk::mesh::Entity> connected_nodes(2);

  // area vector; gather into
  std::vector<double> areaVec(nDim);

  // pointer for fast access
  double *p_lhs = &lhs[0];
  double *p_rhs = &rhs[0];
  double *p_areaVec = &areaVec[0];

  // space for dui/dxj. This variable is the modified gradient with NOC
  std::vector<double> duidxj(nDim*nDim);

  // extrapolated value from the L/R direction
  std::vector<double> uIpL(nDim);
  std::vector<double> uIpR(nDim);
  // limiter values from the L/R direction, 0:1; they stay at 1.0 for the
  // whole call when the limiter is not requested
  std::vector<double> limitL(nDim,1.0);
  std::vector<double> limitR(nDim,1.0);
  // extrapolated gradient from L/R direction
  std::vector<double> duL(nDim);
  std::vector<double> duR(nDim);

  // pointers for fast access
  double *p_duidxj = &duidxj[0];
  double *p_uIpL = &uIpL[0];
  double *p_uIpR = &uIpR[0];
  double *p_limitL = &limitL[0];
  double *p_limitR = &limitR[0];
  double *p_duL = &duL[0];
  double *p_duR = &duR[0];

  // deal with state
  VectorFieldType &velocityNp1 = velocity_->field_of_state(stk::mesh::StateNP1);
  ScalarFieldType &densityNp1 = density_->field_of_state(stk::mesh::StateNP1);

  // define some common selectors
  stk::mesh::Selector s_locally_owned_union = meta_data.locally_owned_part()
    & stk::mesh::selectUnion(partVec_)
    & !(realm_.get_inactive_selector());

  stk::mesh::BucketVector const& edge_buckets =
    realm_.get_buckets( stk::topology::EDGE_RANK, s_locally_owned_union );
  for ( stk::mesh::BucketVector::const_iterator ib = edge_buckets.begin();
        ib != edge_buckets.end() ; ++ib ) {
    stk::mesh::Bucket & b = **ib ;
    const stk::mesh::Bucket::size_type length   = b.size();

    // pointer to edge area vector and mdot
    const double * av = stk::mesh::field_data(*edgeAreaVec_, b);
    const double * mdot = stk::mesh::field_data(*massFlowRate_, b);

    for ( stk::mesh::Bucket::size_type k = 0 ; k < length ; ++k ) {

      // zeroing of lhs/rhs
      for ( int i = 0; i < lhsSize; ++i ) {
        p_lhs[i] = 0.0;
      }
      for ( int i = 0; i < rhsSize; ++i ) {
        p_rhs[i] = 0.0;
      }

      stk::mesh::Entity const * edge_node_rels = b.begin_nodes(k);

      // gather the edge area vector (nDim entries per edge) and mass flow rate
      for ( int j = 0; j < nDim; ++j )
        p_areaVec[j] = av[k*nDim+j];
      const double tmdot = mdot[k];

      // sanity check on number of nodes
      ThrowAssert( b.num_nodes(k) == 2 );

      // left and right nodes
      stk::mesh::Entity nodeL = edge_node_rels[0];
      stk::mesh::Entity nodeR = edge_node_rels[1];

      connected_nodes[0] = nodeL;
      connected_nodes[1] = nodeR;

      // extract nodal fields
      const double * coordL = stk::mesh::field_data(*coordinates_, nodeL);
      const double * coordR = stk::mesh::field_data(*coordinates_, nodeR);

      const double * dudxL = stk::mesh::field_data(*dudx_, nodeL);
      const double * dudxR = stk::mesh::field_data(*dudx_, nodeR);

      const double * vrtmL = stk::mesh::field_data(*velocityRTM_, nodeL);
      const double * vrtmR = stk::mesh::field_data(*velocityRTM_, nodeR);

      const double * uNp1L = stk::mesh::field_data(velocityNp1, nodeL);
      const double * uNp1R = stk::mesh::field_data(velocityNp1, nodeR);

      const double densityL = *stk::mesh::field_data(densityNp1, nodeL);
      const double densityR = *stk::mesh::field_data(densityNp1, nodeR);

      const double viscosityL = *stk::mesh::field_data(*viscosity_, nodeL);
      const double viscosityR = *stk::mesh::field_data(*viscosity_, nodeR);

      // velocity increments from each node to the edge mid-point, using the
      // nodal gradients (dudx is indexed as [nDim*i + j])
      for ( int i = 0; i < nDim; ++i ) {
        // extrapolated du
        p_duL[i] = 0.0;
        p_duR[i] = 0.0;
        const int offSet = nDim*i;
        for ( int j = 0; j < nDim; ++j ) {
          const double dxj = 0.5*(coordR[j] - coordL[j]);
          p_duL[i] += dxj*dudxL[offSet+j];
          p_duR[i] += dxj*dudxR[offSet+j];
        }
      }

      // compute geometry: A.dx, A.A and the edge-averaged (u_rtm).dx
      double axdx = 0.0;
      double asq = 0.0;
      double udotx = 0.0;
      for ( int j = 0; j < nDim; ++j ) {
        const double axj = p_areaVec[j];
        const double dxj = coordR[j] - coordL[j];
        axdx += axj*dxj;
        asq += axj*axj;
        udotx += 0.5*dxj*(vrtmL[j] + vrtmR[j]);
      }

      const double inv_axdx = 1.0/axdx;

      // ip props
      const double viscIp = 0.5*(viscosityL + viscosityR);
      const double diffIp = 0.5*(viscosityL/densityL + viscosityR/densityR);

      // Peclet factor; blends upwind (pecfac) and central (om_pecfac)
      const double pecfac = pecletFunction_->execute(std::abs(udotx)/(diffIp+small));
      const double om_pecfac = 1.0-pecfac;

      // determine limiter if applicable
      if ( useLimiter ) {
        for ( int i = 0; i < nDim; ++i ) {
          const double dq = uNp1R[i] - uNp1L[i];
          const double dqMl = 2.0*2.0*p_duL[i] - dq;
          const double dqMr = 2.0*2.0*p_duR[i] - dq;
          p_limitL[i] = van_leer(dqMl, dq, small);
          p_limitR[i] = van_leer(dqMr, dq, small);
        }
      }

      // final upwind extrapolation; with limiter
      for ( int i = 0; i < nDim; ++i ) {
        p_uIpL[i] = uNp1L[i] + p_duL[i]*hoUpwind*p_limitL[i];
        p_uIpR[i] = uNp1R[i] - p_duR[i]*hoUpwind*p_limitR[i];
      }

      /*
        form duidxj with over-relaxed procedure of Jasak:

        dui/dxj = GjUi +[(uiR - uiL) - GlUi*dxl]*Aj/AxDx
        where Gp is the interpolated pth nodal gradient for ui
      */
      for ( int i = 0; i < nDim; ++i ) {

        // difference between R and L nodes for component i
        const double uidiff = uNp1R[i] - uNp1L[i];

        // offset into all forms of dudx
        const int offSetI = nDim*i;

        // start sum for NOC contribution
        double GlUidxl = 0.0;
        for ( int l = 0; l< nDim; ++l ) {
          const int offSetIL = offSetI+l;
          const double dxl = coordR[l] - coordL[l];
          const double GlUi = 0.5*(dudxL[offSetIL] + dudxR[offSetIL]);
          GlUidxl += GlUi*dxl;
        }

        // form full tensor dui/dxj with NOC
        for ( int j = 0; j < nDim; ++j ) {
          const int offSetIJ = offSetI+j;
          const double axj = p_areaVec[j];
          const double GjUi = 0.5*(dudxL[offSetIJ] + dudxR[offSetIJ]);
          p_duidxj[offSetIJ] = GjUi + (uidiff - GlUidxl)*axj*inv_axdx;
        }
      }

      // divU is the trace of duidxj, which is complete at this point; it does
      // not depend on the velocity component, so evaluate it once per edge
      // instead of once per component
      double divU = 0.0;
      for ( int j = 0; j < nDim; ++j)
        divU += p_duidxj[j*nDim+j];

      // lhs diffusion; only -mu*dui/dxj*Aj contribution for now
      const double dlhsfac = -viscIp*asq*inv_axdx;

      for ( int i = 0; i < nDim; ++i ) {

        // 2nd order central
        const double uiIp = 0.5*(uNp1R[i] + uNp1L[i]);

        // upwind; the extrapolated state is taken from the side the mass
        // flow rate leaves (L for tmdot > 0, otherwise R)
        const double uiUpwind = (tmdot > 0) ? alphaUpw*p_uIpL[i] + om_alphaUpw*uiIp
          : alphaUpw*p_uIpR[i] + om_alphaUpw*uiIp;

        // generalized central (2nd and 4th order)
        const double uiHatL = alpha*p_uIpL[i] + om_alpha*uiIp;
        const double uiHatR = alpha*p_uIpR[i] + om_alpha*uiIp;
        const double uiCds = 0.5*(uiHatL + uiHatR);

        // total advection; pressure contribution in time term expression
        const double aflux = tmdot*(pecfac*uiUpwind + om_pecfac*uiCds);

        // diffusive flux; viscous tensor dotted with area vector
        double dflux = 2.0/3.0*viscIp*divU*p_areaVec[i]*includeDivU_;
        const int offSetI = nDim*i;
        for ( int j = 0; j < nDim; ++j ) {
          const int offSetTrans = nDim*j+i;
          const double axj = p_areaVec[j];
          dflux += -viscIp*(p_duidxj[offSetI+j] + p_duidxj[offSetTrans])*axj;
        }

        // residual for total flux
        const double tflux = aflux + dflux;
        const int indexL = i;
        const int indexR = i + nDim;

        // total flux left
        p_rhs[indexL] -= tflux;
        // total flux right
        p_rhs[indexR] += tflux;

        // setup for LHS; start of the rows for component i at L and at R
        const int rowL = indexL * nodesPerEdge*nDim;
        const int rowR = indexR * nodesPerEdge*nDim;

        //==============================
        // advection first
        //==============================
        const int rLiL = rowL+indexL;
        const int rLiR = rowL+indexR;
        const int rRiL = rowR+indexL;
        const int rRiR = rowR+indexR;

        // upwind advection (includes 4th); left node
        double alhsfac = 0.5*(tmdot+std::abs(tmdot))*pecfac*alphaUpw
          + 0.5*alpha*om_pecfac*tmdot;
        p_lhs[rLiL] += alhsfac;
        p_lhs[rRiL] -= alhsfac;

        // upwind advection (includes 4th); right node
        alhsfac = 0.5*(tmdot-std::abs(tmdot))*pecfac*alphaUpw
          + 0.5*alpha*om_pecfac*tmdot;
        p_lhs[rRiR] -= alhsfac;
        p_lhs[rLiR] += alhsfac;

        // central; left; collect terms on alpha and alphaUpw
        alhsfac = 0.5*tmdot*(pecfac*om_alphaUpw + om_pecfac*om_alpha);
        p_lhs[rLiL] += alhsfac;
        p_lhs[rLiR] += alhsfac;
        // central; right
        p_lhs[rRiL] -= alhsfac;
        p_lhs[rRiR] -= alhsfac;

        //==============================
        // diffusion second
        //==============================
        const double axi = p_areaVec[i];

        // diffusion; row IL
        p_lhs[rLiL] -= dlhsfac;
        p_lhs[rLiR] += dlhsfac;

        // diffusion; row IR
        p_lhs[rRiL] += dlhsfac;
        p_lhs[rRiR] -= dlhsfac;

        // more diffusion; see theory manual. This couples component i to
        // every component j through Ai*Aj/AxDx
        for ( int j = 0; j < nDim; ++j ) {
          const double lhsfacNS = -viscIp*axi*p_areaVec[j]*inv_axdx;

          const int colL = j;
          const int colR = j + nDim;

          // first left; IL,IL; IL,IR
          p_lhs[rowL + colL] -= lhsfacNS;
          p_lhs[rowL + colR] += lhsfacNS;

          // now right, IR,IL; IR,IR
          p_lhs[rowR + colL] += lhsfacNS;
          p_lhs[rowR + colR] -= lhsfacNS;
        }

      }

      apply_coeff(connected_nodes, rhs, lhs, __FILE__);

    }
  }
}
//--------------------------------------------------------------------------
//-------- execute ---------------------------------------------------------
//--------------------------------------------------------------------------
void
AssembleScalarEdgeSolverAlgorithm::execute()
{

  stk::mesh::BulkData & bulk_data = realm_.bulk_data();
  stk::mesh::MetaData & meta_data = realm_.meta_data();

  const int nDim = meta_data.spatial_dimension();

  const double small = 1.0e-16;

  // extract user advection options (allow to potentially change over time)
  const std::string dofName = scalarQ_->name();
  const double hybridFactor = realm_.get_hybrid_factor(dofName);
  const double alpha = realm_.get_alpha_factor(dofName);
  const double alphaUpw = realm_.get_alpha_upw_factor(dofName);
  const double hoUpwind = realm_.get_upw_factor(dofName);
  const bool useLimiter = realm_.primitive_uses_limiter(dofName);

  // one minus flavor
  const double om_alpha = 1.0-alpha;
  const double om_alphaUpw = 1.0-alphaUpw;

  // space for LHS/RHS; always edge connectivity
  const int nodesPerEdge = 2;
  const int lhsSize = nodesPerEdge*nodesPerEdge;
  const int rhsSize = nodesPerEdge;
  std::vector<double> lhs(lhsSize);
  std::vector<double> rhs(rhsSize);
  std::vector<stk::mesh::Entity> connected_nodes(2);

  // area vector; gather into
  std::vector<double> areaVec(nDim);

  // pointer for fast access
  double *p_lhs = &lhs[0];
  double *p_rhs = &rhs[0];
  double *p_areaVec = &areaVec[0];

  // deal with state
  ScalarFieldType &scalarQNp1  = scalarQ_->field_of_state(stk::mesh::StateNP1);
  ScalarFieldType &densityNp1 = density_->field_of_state(stk::mesh::StateNP1);

  // define some common selectors
  stk::mesh::Selector s_locally_owned_union = meta_data.locally_owned_part()
    & stk::mesh::selectUnion(partVec_) 
    & !(realm_.get_inactive_selector());

  stk::mesh::BucketVector const& edge_buckets =
    realm_.get_buckets( stk::topology::EDGE_RANK, s_locally_owned_union );
  for ( stk::mesh::BucketVector::const_iterator ib = edge_buckets.begin();
        ib != edge_buckets.end() ; ++ib ) {
    stk::mesh::Bucket & b = **ib ;
    const stk::mesh::Bucket::size_type length   = b.size();

    // pointer to edge area vector and mdot
    const double * av = stk::mesh::field_data(*edgeAreaVec_, b);
    const double * mdot = stk::mesh::field_data(*massFlowRate_, b);

    for ( stk::mesh::Bucket::size_type k = 0 ; k < length ; ++k ) {

      // zeroing of lhs/rhs
      for ( int i = 0; i < lhsSize; ++i ) {
        p_lhs[i] = 0.0;
      }
      for ( int i = 0; i < rhsSize; ++i ) {
        p_rhs[i] = 0.0;
      }

      // get edge
      stk::mesh::Entity edge = b[k];

      stk::mesh::Entity const * edge_node_rels = bulk_data.begin_nodes(edge);

      // sanity check on number or nodes
      ThrowAssert( bulk_data.num_nodes(edge) == 2 );

      // pointer to edge area vector
      for ( int j = 0; j < nDim; ++j )
        p_areaVec[j] = av[k*nDim+j];
      const double tmdot = mdot[k];

      // left and right nodes
      stk::mesh::Entity nodeL = edge_node_rels[0];
      stk::mesh::Entity nodeR = edge_node_rels[1];

      connected_nodes[0] = nodeL;
      connected_nodes[1] = nodeR;

      // extract nodal fields
      const double * coordL = stk::mesh::field_data(*coordinates_, nodeL);
      const double * coordR = stk::mesh::field_data(*coordinates_, nodeR);

      const double * dqdxL = stk::mesh::field_data(*dqdx_, nodeL);
      const double * dqdxR = stk::mesh::field_data(*dqdx_, nodeR);

      const double * vrtmL = stk::mesh::field_data(*velocityRTM_, nodeL);
      const double * vrtmR = stk::mesh::field_data(*velocityRTM_, nodeR);

      const double qNp1L = *stk::mesh::field_data(scalarQNp1, nodeL);
      const double qNp1R = *stk::mesh::field_data(scalarQNp1, nodeR);

      const double densityL = *stk::mesh::field_data(densityNp1, nodeL);
      const double densityR = *stk::mesh::field_data(densityNp1, nodeR);

      const double diffFluxCoeffL = *stk::mesh::field_data(*diffFluxCoeff_, nodeL);
      const double diffFluxCoeffR = *stk::mesh::field_data(*diffFluxCoeff_, nodeR);

      // compute geometry
      double axdx = 0.0;
      double asq = 0.0;
      double udotx = 0.0;
      for ( int j = 0; j < nDim; ++j ) {
        const double axj = p_areaVec[j];
        const double dxj = coordR[j] - coordL[j];
        asq += axj*axj;
        axdx += axj*dxj;
        udotx += 0.5*dxj*(vrtmL[j] + vrtmR[j]);
      }

      const double inv_axdx = 1.0/axdx;

      // ip props
      const double viscIp = 0.5*(diffFluxCoeffL + diffFluxCoeffR);
      const double diffIp = 0.5*(diffFluxCoeffL/densityL + diffFluxCoeffR/densityR);

      // Peclet factor
      double pecfac = hybridFactor*udotx/(diffIp+small);
      pecfac = pecfac*pecfac/(5.0 + pecfac*pecfac);
      const double om_pecfac = 1.0-pecfac;

      // left and right extrapolation; add in diffusion calc
      double dqL = 0.0;
      double dqR = 0.0;
      double nonOrth = 0.0;
      for ( int j = 0; j < nDim; ++j ) {
        const double dxj = coordR[j] - coordL[j];
        dqL += 0.5*dxj*dqdxL[j];
        dqR += 0.5*dxj*dqdxR[j];
        // now non-orth (over-relaxed procedure of Jasek)
        const double axj = p_areaVec[j];
        const double kxj = axj - asq*inv_axdx*dxj;
        const double GjIp = 0.5*(dqdxL[j] + dqdxR[j]);
        nonOrth += -viscIp*kxj*GjIp;
      }

      // add limiter if appropriate
      double limitL = 1.0;
      double limitR = 1.0;
      const double dq = qNp1R - qNp1L;
      if ( useLimiter ) {
        const double dqMl = 2.0*2.0*dqL - dq;
        const double dqMr = 2.0*2.0*dqR - dq;
        limitL = van_leer(dqMl, dq, small);
        limitR = van_leer(dqMr, dq, small);
      }
      
      // extrapolated; for now limit
      const double qIpL = qNp1L + dqL*hoUpwind*limitL;
      const double qIpR = qNp1R - dqR*hoUpwind*limitR;

      //====================================
      // diffusive flux
      //====================================
      double lhsfac = -viscIp*asq*inv_axdx;
      double diffFlux = lhsfac*(qNp1R - qNp1L) + nonOrth;

      // first left
      p_lhs[0] = -lhsfac;
      p_lhs[1] = +lhsfac;
      p_rhs[0] = -diffFlux;

      // now right
      p_lhs[2] = +lhsfac;
      p_lhs[3] = -lhsfac;
      p_rhs[1] = diffFlux;

      //====================================
      // advective flux
      //====================================

      // 2nd order central
      const double qIp = 0.5*( qNp1L + qNp1R );

      // upwind
      const double qUpwind = (tmdot > 0) ? alphaUpw*qIpL + om_alphaUpw*qIp
          : alphaUpw*qIpR + om_alphaUpw*qIp;

      // generalized central (2nd and 4th order)
      const double qHatL = alpha*qIpL + om_alpha*qIp;
      const double qHatR = alpha*qIpR + om_alpha*qIp;
      const double qCds = 0.5*(qHatL + qHatR);

      // total advection
      const double aflux = tmdot*(pecfac*qUpwind + om_pecfac*qCds);

      // upwind advection (includes 4th); left node
      double alhsfac = 0.5*(tmdot+std::abs(tmdot))*pecfac*alphaUpw
        + 0.5*alpha*om_pecfac*tmdot;
      p_lhs[0] += alhsfac;
      p_lhs[2] -= alhsfac;

      // upwind advection; right node
      alhsfac = 0.5*(tmdot-std::abs(tmdot))*pecfac*alphaUpw
        + 0.5*alpha*om_pecfac*tmdot;
      p_lhs[3] -= alhsfac;
      p_lhs[1] += alhsfac;

      // central; left; collect terms on alpha and alphaUpw
      alhsfac = 0.5*tmdot*(pecfac*om_alphaUpw + om_pecfac*om_alpha);
      p_lhs[0] += alhsfac;
      p_lhs[1] += alhsfac;
      // central; right; collect terms on alpha and alphaUpw
      p_lhs[2] -= alhsfac;
      p_lhs[3] -= alhsfac;

      // total flux left
      p_rhs[0] -= aflux;
      // total flux right
      p_rhs[1] += aflux;

      apply_coeff(connected_nodes, rhs, lhs, __FILE__);

    }
  }
}
//--------------------------------------------------------------------------
//-------- execute ---------------------------------------------------------
//--------------------------------------------------------------------------
void
AssembleScalarElemSolverAlgorithm::execute()
{
  // Element-based assembly of the advection/diffusion contributions for
  // scalarQ_.
  //
  // For every selected element a nodesPerElement x nodesPerElement lhs
  // (row-major) and a nodesPerElement rhs are built by looping over the
  // sub-control-surface integration points of the surface master element;
  // each integration point exchanges flux between its left (il) and right (ir)
  // adjacent nodes. Supplemental algorithms may add to the local system
  // before it is handed to apply_coeff().

  stk::mesh::BulkData & bulk_data = realm_.bulk_data();
  stk::mesh::MetaData & meta_data = realm_.meta_data();

  const int nDim = meta_data.spatial_dimension();

  // guards the division in the Peclet number; also passed to the limiter
  const double small = 1.0e-16;


  // extract user advection options (allow to potentially change over time)
  const std::string dofName = scalarQ_->name();
  const double hybridFactor = realm_.get_hybrid_factor(dofName);
  const double alpha = realm_.get_alpha_factor(dofName);
  const double alphaUpw = realm_.get_alpha_upw_factor(dofName);
  const double hoUpwind = realm_.get_upw_factor(dofName);
  const bool useLimiter = realm_.primitive_uses_limiter(dofName);

  // one minus flavor..
  const double om_alpha = 1.0-alpha;
  const double om_alphaUpw = 1.0-alphaUpw;

  // space for LHS/RHS; nodesPerElem*nodesPerElem* and nodesPerElem
  std::vector<double> lhs;
  std::vector<double> rhs;
  std::vector<stk::mesh::Entity> connected_nodes;

  // supplemental algorithm size and setup
  const size_t supplementalAlgSize = supplementalAlg_.size();
  for ( size_t i = 0; i < supplementalAlgSize; ++i )
    supplementalAlg_[i]->setup();

  // nodal fields to gather
  std::vector<double> ws_velocityNp1;
  std::vector<double> ws_meshVelocity;
  std::vector<double> ws_vrtm;
  std::vector<double> ws_coordinates;
  std::vector<double> ws_scalarQNp1;
  std::vector<double> ws_dqdx;
  std::vector<double> ws_density;
  std::vector<double> ws_diffFluxCoeff;

  // geometry related to populate
  std::vector<double> ws_scs_areav;
  std::vector<double> ws_dndx;
  std::vector<double> ws_deriv;
  std::vector<double> ws_det_j;
  std::vector<double> ws_shape_function;

  // ip values
  std::vector<double>coordIp(nDim);

  // pointers
  double *p_coordIp = &coordIp[0];

  // deal with state
  ScalarFieldType &scalarQNp1   = scalarQ_->field_of_state(stk::mesh::StateNP1);
  VectorFieldType &velocityNp1 = velocity_->field_of_state(stk::mesh::StateNP1);
  ScalarFieldType &densityNp1 = density_->field_of_state(stk::mesh::StateNP1);

  // define some common selectors; inactive entities are excluded so that the
  // element-based assembly covers the same active set as the edge-based
  // algorithms in this file
  stk::mesh::Selector s_locally_owned_union = meta_data.locally_owned_part()
    & stk::mesh::selectUnion(partVec_)
    & !(realm_.get_inactive_selector());

  stk::mesh::BucketVector const& elem_buckets =
    realm_.get_buckets( stk::topology::ELEMENT_RANK, s_locally_owned_union );
  for ( stk::mesh::BucketVector::const_iterator ib = elem_buckets.begin();
        ib != elem_buckets.end() ; ++ib ) {
    stk::mesh::Bucket & b = **ib ;
    const stk::mesh::Bucket::size_type length   = b.size();

    // extract master element
    MasterElement *meSCS = realm_.get_surface_master_element(b.topology());

    // extract master element specifics
    const int nodesPerElement = meSCS->nodesPerElement_;
    const int numScsIp = meSCS->numIntPoints_;
    const int *lrscv = meSCS->adjacentNodes();

    // resize some things; matrix related
    const int lhsSize = nodesPerElement*nodesPerElement;
    const int rhsSize = nodesPerElement;
    lhs.resize(lhsSize);
    rhs.resize(rhsSize);
    connected_nodes.resize(nodesPerElement);

    // algorithm related
    ws_velocityNp1.resize(nodesPerElement*nDim);
    ws_meshVelocity.resize(nodesPerElement*nDim);
    ws_vrtm.resize(nodesPerElement*nDim);
    ws_coordinates.resize(nodesPerElement*nDim);
    ws_dqdx.resize(nodesPerElement*nDim);
    ws_scalarQNp1.resize(nodesPerElement);
    ws_density.resize(nodesPerElement);
    ws_diffFluxCoeff.resize(nodesPerElement);
    ws_scs_areav.resize(numScsIp*nDim);
    ws_dndx.resize(nDim*numScsIp*nodesPerElement);
    ws_deriv.resize(nDim*numScsIp*nodesPerElement);
    ws_det_j.resize(numScsIp);
    ws_shape_function.resize(numScsIp*nodesPerElement);

    // pointers are taken after the resizes above, which may reallocate
    double *p_lhs = &lhs[0];
    double *p_rhs = &rhs[0];
    double *p_velocityNp1 = &ws_velocityNp1[0];
    double *p_meshVelocity = &ws_meshVelocity[0];
    double *p_vrtm = &ws_vrtm[0];
    double *p_coordinates = &ws_coordinates[0];
    double *p_dqdx = &ws_dqdx[0];
    double *p_scalarQNp1 = &ws_scalarQNp1[0];
    double *p_density = &ws_density[0];
    double *p_diffFluxCoeff = &ws_diffFluxCoeff[0];
    double *p_scs_areav = &ws_scs_areav[0];
    double *p_dndx = &ws_dndx[0];
    double *p_shape_function = &ws_shape_function[0];

    // extract shape function; done once per bucket
    meSCS->shape_fcn(&p_shape_function[0]);

    for ( stk::mesh::Bucket::size_type k = 0 ; k < length ; ++k ) {
      // get elem
      stk::mesh::Entity elem = b[k];

      // zero lhs/rhs
      for ( int p = 0; p < lhsSize; ++p )
        p_lhs[p] = 0.0;
      for ( int p = 0; p < rhsSize; ++p )
        p_rhs[p] = 0.0;


      // ip data for this element; one mass flow rate per scs ip
      const double *mdot = stk::mesh::field_data(*massFlowRate_, elem );

      //===============================================
      // gather nodal data; this is how we do it now..
      //===============================================
      stk::mesh::Entity const * node_rels = bulk_data.begin_nodes(elem);
      int num_nodes = bulk_data.num_nodes(elem);

      // sanity check on num nodes
      ThrowAssert( num_nodes == nodesPerElement );

      for ( int ni = 0; ni < num_nodes; ++ni ) {
        stk::mesh::Entity node = node_rels[ni];

        // set connected nodes
        connected_nodes[ni] = node;

        // pointers to real data
        const double * uNp1   = stk::mesh::field_data(velocityNp1, node );
        const double * vNp1   = stk::mesh::field_data(*meshVelocity_, node);
        const double * coords = stk::mesh::field_data(*coordinates_, node );
        const double * dq     = stk::mesh::field_data(*dqdx_, node );

        // gather scalars
        p_scalarQNp1[ni]    = *stk::mesh::field_data(scalarQNp1, node );
        p_density[ni]       = *stk::mesh::field_data(densityNp1, node );
        p_diffFluxCoeff[ni] = *stk::mesh::field_data(*diffFluxCoeff_, node );

        // gather vectors; vrtm starts out as the fluid velocity and is made
        // relative to the mesh further below when mesh motion is active
        const int niNdim = ni*nDim;
        for ( int i=0; i < nDim; ++i ) {
          p_velocityNp1[niNdim+i] = uNp1[i];
          p_vrtm[niNdim+i] = uNp1[i];
          p_meshVelocity[niNdim+i] = vNp1[i];
          p_coordinates[niNdim+i] = coords[i];
          p_dqdx[niNdim+i] = dq[i];
        }
      }

      // compute geometry
      double scs_error = 0.0;
      meSCS->determinant(1, &p_coordinates[0], &p_scs_areav[0], &scs_error);

      // compute dndx
      meSCS->grad_op(1, &p_coordinates[0], &p_dndx[0], &ws_deriv[0], &ws_det_j[0], &scs_error);

      // manage velocity relative to mesh
      if ( meshMotion_ ) {
        const int vrtmSize = num_nodes*nDim;
        for ( int n = 0; n < vrtmSize; ++n ) {
          p_vrtm[n] -= p_meshVelocity[n];
        }
      }

      for ( int ip = 0; ip < numScsIp; ++ip ) {

        // left and right nodes for this ip
        const int il = lrscv[2*ip];
        const int ir = lrscv[2*ip+1];

        // corresponding matrix rows
        const int rowL = il*nodesPerElement;
        const int rowR = ir*nodesPerElement;

        // save off mdot
        const double tmdot = mdot[ip];

        // zero out values of interest for this ip
        for ( int j = 0; j < nDim; ++j ) {
          p_coordIp[j] = 0.0;
        }

        // save off ip values; offset to Shape Function
        double rhoIp = 0.0;
        double muIp = 0.0;
        double qIp = 0.0;
        const int offSetSF = ip*nodesPerElement;
        for ( int ic = 0; ic < nodesPerElement; ++ic ) {
          const double r = p_shape_function[offSetSF+ic];
          rhoIp += r*p_density[ic];
          muIp += r*p_diffFluxCoeff[ic];
          qIp += r*p_scalarQNp1[ic];
          // compute scs point values
          for ( int i = 0; i < nDim; ++i ) {
            p_coordIp[i] += r*p_coordinates[ic*nDim+i];
          }
        }

        // Peclet factor; along the edge
        const double diffIp = 0.5*(p_diffFluxCoeff[il]/p_density[il]
                                   + p_diffFluxCoeff[ir]/p_density[ir]);
        double udotx = 0.0;
        for(int j = 0; j < nDim; ++j ) {
          const double dxj = p_coordinates[ir*nDim+j]-p_coordinates[il*nDim+j];
          const double uj = 0.5*(p_vrtm[il*nDim+j] + p_vrtm[ir*nDim+j]);
          udotx += uj*dxj;
        }
        double pecfac = hybridFactor*udotx/(diffIp+small);
        pecfac = pecfac*pecfac/(5.0 + pecfac*pecfac);
        const double om_pecfac = 1.0-pecfac;

        // left and right extrapolation; from each adjacent node to the ip
        double dqL = 0.0;
        double dqR = 0.0;
        for(int j = 0; j < nDim; ++j ) {
          const double dxjL = p_coordIp[j] - p_coordinates[il*nDim+j];
          const double dxjR = p_coordinates[ir*nDim+j] - p_coordIp[j];
          dqL += dxjL*p_dqdx[nDim*il+j];
          dqR += dxjR*p_dqdx[nDim*ir+j];
        }

        // add limiter if appropriate
        double limitL = 1.0;
        double limitR = 1.0;
        if ( useLimiter ) {
          const double dq = p_scalarQNp1[ir] - p_scalarQNp1[il];
          const double dqMl = 2.0*2.0*dqL - dq;
          const double dqMr = 2.0*2.0*dqR - dq;
          limitL = van_leer(dqMl, dq, small);
          limitR = van_leer(dqMr, dq, small);
        }

        // extrapolated; for now limit (along edge is fine)
        const double qIpL = p_scalarQNp1[il] + dqL*hoUpwind*limitL;
        const double qIpR = p_scalarQNp1[ir] - dqR*hoUpwind*limitR;

        // assemble advection; rhs and upwind contributions

        // 2nd order central; simply qIp from above

        // upwind
        const double qUpwind = (tmdot > 0) ? alphaUpw*qIpL + om_alphaUpw*qIp
            : alphaUpw*qIpR + om_alphaUpw*qIp;

        // generalized central (2nd and 4th order)
        const double qHatL = alpha*qIpL + om_alpha*qIp;
        const double qHatR = alpha*qIpR + om_alpha*qIp;
        const double qCds = 0.5*(qHatL + qHatR);

        // total advection
        const double aflux = tmdot*(pecfac*qUpwind + om_pecfac*qCds);

        // right hand side; L and R
        p_rhs[il] -= aflux;
        p_rhs[ir] += aflux;

        // advection operator sens; all but central

        // upwind advection (includes 4th); left node
        const double alhsfacL = 0.5*(tmdot+std::abs(tmdot))*pecfac*alphaUpw
          + 0.5*alpha*om_pecfac*tmdot;
        p_lhs[rowL+il] += alhsfacL;
        p_lhs[rowR+il] -= alhsfacL;

        // upwind advection; right node
        const double alhsfacR = 0.5*(tmdot-std::abs(tmdot))*pecfac*alphaUpw
          + 0.5*alpha*om_pecfac*tmdot;
        p_lhs[rowR+ir] -= alhsfacR;
        p_lhs[rowL+ir] += alhsfacR;

        // central advection and diffusion couple il/ir to every element node
        double qDiff = 0.0;
        for ( int ic = 0; ic < nodesPerElement; ++ic ) {

          // shape function
          const double r = p_shape_function[offSetSF+ic];

          // upwind (il/ir) handled above; collect terms on alpha and alphaUpw
          const double lhsfacAdv = r*tmdot*(pecfac*om_alphaUpw + om_pecfac*om_alpha);

          // advection operator lhs; rhs handled above
          // lhs; il then ir
          p_lhs[rowL+ic] += lhsfacAdv;
          p_lhs[rowR+ic] -= lhsfacAdv;

          // diffusion; -mu*dN_ic/dx_j*A_j at this ip
          double lhsfacDiff = 0.0;
          const int offSetDnDx = nDim*nodesPerElement*ip + ic*nDim;
          for ( int j = 0; j < nDim; ++j ) {
            lhsfacDiff += -muIp*p_dndx[offSetDnDx+j]*p_scs_areav[ip*nDim+j];
          }

          qDiff += lhsfacDiff*p_scalarQNp1[ic];

          // lhs; il then ir
          p_lhs[rowL+ic] += lhsfacDiff;
          p_lhs[rowR+ic] -= lhsfacDiff;
        }

        // rhs; il then ir
        p_rhs[il] -= qDiff;
        p_rhs[ir] += qDiff;

      }

      // call supplemental
      for ( size_t i = 0; i < supplementalAlgSize; ++i )
        supplementalAlg_[i]->elem_execute( nodesPerElement, numScsIp, &lhs[0], &rhs[0], elem);

      apply_coeff(connected_nodes, rhs, lhs, __FILE__);

    }
  }
}
//--------------------------------------------------------------------------
//-------- execute ---------------------------------------------------------
//--------------------------------------------------------------------------
// Assemble the momentum contributions of contact (halo) edges.
//
// For every HaloInfo stored in each ContactInfo of realm_.contactManager_, an
// edge is formed between the face node (left state, L) and the halo point
// (right state, R). Right-state values are obtained by interpolating the
// owning element's nodal values at the stored isoparametric coordinates.
//
// Local system layout (per halo edge):
//   - local node 0 is the face node; local nodes 1..nodesPerElement are the
//     nodes of the owning element (see connected_nodes);
//   - dof index = component + nDim*localNode;
//   - lhs is stored row-major with npePlusOne*nDim columns.
// Only the face-node rows are filled; the element-node rows stay zero.
void
AssembleMomentumEdgeContactSolverAlgorithm::execute()
{

  stk::mesh::MetaData & meta_data = realm_.meta_data();
  stk::mesh::BulkData & bulk_data = realm_.bulk_data();

  const int nDim = meta_data.spatial_dimension();

  // guards the Peclet number and limiter against division by zero
  const double small = 1.0e-16;

  // extract user advection options (allow to potentially change over time)
  const std::string dofName = "velocity";
  const double alpha = realm_.get_alpha_factor(dofName);
  const double alphaUpw = realm_.get_alpha_upw_factor(dofName);
  const double hoUpwind = realm_.get_upw_factor(dofName);

  const bool useLimiter = realm_.primitive_uses_limiter(dofName);
  // one minus flavor
  const double om_alpha = 1.0-alpha;
  const double om_alphaUpw = 1.0-alphaUpw;

  // space for LHS/RHS; (nodesPerElem+1)*nDim*(nodesPerElem+1)*nDim; (nodesPerElem+1)*nDim
  // sized per ContactInfo below since the master element may differ
  std::vector<double> lhs;
  std::vector<double> rhs;
  std::vector<stk::mesh::Entity> connected_nodes;

  // space for dui/dxj. This variable is the modified gradient with NOC
  std::vector<double> duidxj(nDim*nDim);

  // extrapolated value from the L/R direction
  std::vector<double> uIpL(nDim);
  std::vector<double> uIpR(nDim);

  // limiter values from the L/R direction, 0:1; stay at 1.0 unless useLimiter
  std::vector<double> limitL(nDim,1.0);
  std::vector<double> limitR(nDim,1.0);

  // extrapolated gradient from L/R direction
  std::vector<double> duL(nDim);
  std::vector<double> duR(nDim);

  // pointers for fast access
  double *p_duidxj = &duidxj[0];
  double *p_uIpL = &uIpL[0];
  double *p_uIpR = &uIpR[0];
  double *p_limitL = &limitL[0];
  double *p_limitR = &limitR[0];
  double *p_duL = &duL[0];
  double *p_duR = &duR[0];

  // space for interpolated right state (halo)
  double densityR;
  double viscosityR;
  std::vector<double> uNp1R(nDim);
  std::vector<double> dudxR(nDim*nDim);

  // interpolate nodal values to point-in-elem
  const int sizeOfScalarField = 1;
  const int sizeOfVectorField = nDim;
  const int sizeOfTensorField = nDim*nDim;

  // deal with state
  VectorFieldType &velocityNp1 = velocity_->field_of_state(stk::mesh::StateNP1);
  ScalarFieldType &densityNp1 = density_->field_of_state(stk::mesh::StateNP1);

  // mesh motion; velocity relative to mesh for the L/R states
  std::vector<double> vrtmL(nDim);
  std::vector<double> vrtmR(nDim);
  double * p_vrtmL = &vrtmL[0];
  double * p_vrtmR = &vrtmR[0];

  // parallel communicate ghosted entities
  if ( NULL != realm_.contactManager_->contactGhosting_ )
    stk::mesh::communicate_field_data(*(realm_.contactManager_->contactGhosting_), ghostFieldVec_);

  // iterate contactInfoVec_
  std::vector<ContactInfo *>::iterator ii;
  for( ii=realm_.contactManager_->contactInfoVec_.begin();
       ii!=realm_.contactManager_->contactInfoVec_.end(); ++ii ) {

    // get master element type for this contactInfo
    MasterElement *meSCS  = (*ii)->meSCS_;
    const int nodesPerElement = meSCS->nodesPerElement_;

    // element nodal data gathered for interpolation to the halo point;
    // vectors/tensors are stored component-major: [component][node]
    std::vector<double> elemNodalUnp1(nDim*nodesPerElement);
    std::vector<double> elemNodalVisc(nodesPerElement);
    std::vector<double> elemNodalRho(nodesPerElement);
    std::vector<double> elemNodalDudx(nDim*nDim*nodesPerElement);
    std::vector<double> shpfc(nodesPerElement);

    // resize some things; matrix related
    const int npePlusOne = nodesPerElement+1;
    const int lhsSize = npePlusOne*nDim*npePlusOne*nDim;
    const int rhsSize = npePlusOne*nDim;
    lhs.resize(lhsSize);
    rhs.resize(rhsSize);
    connected_nodes.resize(npePlusOne);

    // pointer to lhs/rhs
    double *p_lhs = &lhs[0];
    double *p_rhs = &rhs[0];

    // scaling for lhs
    const double inv_nodesPerElement = 1.0/double(nodesPerElement);

    // iterate halo face nodes
    std::map<uint64_t, HaloInfo *>::iterator iterHalo;
    for (iterHalo  = (*ii)->haloInfoMap_.begin();
         iterHalo != (*ii)->haloInfoMap_.end();
         ++iterHalo) {

      // halo info object of interest
      HaloInfo * infoObject = (*iterHalo).second;

      // zeroing of lhs/rhs
      for ( int k = 0; k < lhsSize; ++k ) {
        p_lhs[k] = 0.0;
      }
      for ( int k = 0; k < rhsSize; ++k ) {
        p_rhs[k] = 0.0;
      }

      // pointer to edge area vector and mdot
      const double *p_areaVec = &infoObject->haloEdgeAreaVec_[0];
      const double tmdot = *stk::mesh::field_data(*haloMdot_, infoObject->faceNode_);

      // extract element mesh object and global id for face node
      stk::mesh::Entity elem  = infoObject->owningElement_;

      stk::mesh::Entity const* elem_node_rels = bulk_data.begin_nodes(elem);
      // NOTE(review): the buffers above are sized with nodesPerElement from
      // the master element; this presumes num_nodes == nodesPerElement
      const int num_nodes = bulk_data.num_nodes(elem);

      // now load the elemental values for future interpolation; fill in connected nodes
      connected_nodes[0] = infoObject->faceNode_;
      for ( int ni = 0; ni < num_nodes; ++ni ) {
        stk::mesh::Entity node = elem_node_rels[ni];
        connected_nodes[ni+1] = node;

        elemNodalRho[ni] = *stk::mesh::field_data(densityNp1, node);
        elemNodalVisc[ni] = *stk::mesh::field_data(*viscosity_, node);

        // load up vectors/tensor
        const double *uNp1 = stk::mesh::field_data(velocityNp1, node );
        const double *dudx = stk::mesh::field_data(*dudx_, node );
        for ( int i = 0; i < nDim; ++i ) {
          const int offSet = i*nodesPerElement + ni;
          elemNodalUnp1[offSet] = uNp1[i];

          const int rowI = i*nDim;
          const int offSetT = i*nodesPerElement*nDim;
          for ( int j = 0; j < nDim; ++j ) {
            elemNodalDudx[offSetT+j*nodesPerElement+ni] = dudx[rowI+j];
          }
        }
      }

      // extract nodal fields; right state is Halo and requires interpolation
      const double *coordL = stk::mesh::field_data(*coordinates_, infoObject->faceNode_);
      const double *coordR = &infoObject->haloNodalCoords_[0];

      const double *dudxL = stk::mesh::field_data(*dudx_, infoObject->faceNode_);
      meSCS->interpolatePoint(
        sizeOfTensorField,
        &(infoObject->isoParCoords_[0]),
        &elemNodalDudx[0],
        &(dudxR[0]));

      const double *uNp1L = stk::mesh::field_data(velocityNp1, infoObject->faceNode_);
      meSCS->interpolatePoint(
        sizeOfVectorField,
        &(infoObject->isoParCoords_[0]),
        &elemNodalUnp1[0],
        &(uNp1R[0]));

      const double densityL = *stk::mesh::field_data(densityNp1, infoObject->faceNode_);
      meSCS->interpolatePoint(
        sizeOfScalarField,
        &(infoObject->isoParCoords_[0]),
        &elemNodalRho[0],
        &densityR);

      const double viscosityL = *stk::mesh::field_data(*viscosity_, infoObject->faceNode_);
      meSCS->interpolatePoint(
        sizeOfScalarField,
        &(infoObject->isoParCoords_[0]),
        &elemNodalVisc[0],
        &viscosityR);

      // copy to velocity relative to mesh; squeeze in extrapolated values
      for ( int i = 0; i < nDim; ++i ) {
        p_vrtmL[i] = uNp1L[i];
        p_vrtmR[i] = uNp1R[i];
        // extrapolated du; nodal gradient projected onto half the edge vector
        p_duL[i] = 0.0;
        p_duR[i] = 0.0;
        const int offSet = nDim*i;
        for ( int j = 0; j < nDim; ++j ) {
          const double dxj = 0.5*(coordR[j] - coordL[j]);
          p_duL[i] += dxj*dudxL[offSet+j];
          p_duR[i] += dxj*dudxR[offSet+j];
        }
      }

      // deal with mesh motion
      if ( meshMotion_ ) {
        const double * meshVelocityL = stk::mesh::field_data(*meshVelocity_, infoObject->faceNode_);
        const double * meshVelocityR = &(infoObject->haloMeshVelocity_[0]);
        for (int j = 0; j < nDim; ++j ) {
          p_vrtmL[j] -= meshVelocityL[j];
          p_vrtmR[j] -= meshVelocityR[j];
        }
      }

      // compute geometry; A.dx, A.A and the edge-averaged relative velocity dotted with dx
      double axdx = 0.0;
      double asq = 0.0;
      double udotx = 0.0;
      for (int j = 0; j < nDim; ++j ) {
        const double axj = p_areaVec[j];
        const double dxj = coordR[j] - coordL[j];
        axdx += axj*dxj;
        asq += axj*axj;
        udotx += 0.5*dxj*(p_vrtmL[j] + p_vrtmR[j]);
      }

      const double inv_axdx = 1.0/axdx;

      // ip props
      const double viscIp = 0.5*(viscosityL + viscosityR);
      const double diffIp = 0.5*(viscosityL/densityL + viscosityR/densityR);

      // Peclet factor
      const double pecfac = pecletFunction_->execute(std::abs(udotx)/(diffIp+small));
      const double om_pecfac = 1.0-pecfac;

      // determine limiter if applicable
      if ( useLimiter ) {
        for ( int i = 0; i < nDim; ++i ) {
          const double dq = uNp1R[i] - uNp1L[i];
          const double dqMl = 2.0*2.0*p_duL[i] - dq;
          const double dqMr = 2.0*2.0*p_duR[i] - dq;
          p_limitL[i] = van_leer(dqMl, dq, small);
          p_limitR[i] = van_leer(dqMr, dq, small);
        }
      }

      // final upwind extrapolation; with limiter
      for ( int i = 0; i < nDim; ++i ) {
        p_uIpL[i] = uNp1L[i] + p_duL[i]*hoUpwind*p_limitL[i];
        p_uIpR[i] = uNp1R[i] - p_duR[i]*hoUpwind*p_limitR[i];
      }

      /*
        form duidxj with over-relaxed procedure of Jasak:

        dui/dxj = GjUi +[(uiR - uiL) - GlUi*dxl]*Aj/AxDx
        where Gp is the interpolated pth nodal gradient for ui
      */
      for ( int i = 0; i < nDim; ++i ) {

        // difference between R and L nodes for component i
        const double uidiff = uNp1R[i] - uNp1L[i];

        // offset into all forms of dudx
        const int offSetI = nDim*i;

        // start sum for NOC contribution
        double GlUidxl = 0.0;
        for ( int l = 0; l< nDim; ++l ) {
          const int offSetIL = offSetI+l;
          const double dxl = coordR[l] - coordL[l];
          const double GlUi = 0.5*(dudxL[offSetIL] + dudxR[offSetIL]);
          GlUidxl += GlUi*dxl;
        }

        // form full tensor dui/dxj with NOC
        for ( int j = 0; j < nDim; ++j ) {
          const int offSetIJ = offSetI+j;
          const double axj = p_areaVec[j];
          const double GjUi = 0.5*(dudxL[offSetIJ] + dudxR[offSetIJ]);
          p_duidxj[offSetIJ] = GjUi + (uidiff - GlUidxl)*axj*inv_axdx;
        }
      }

      // divU
      double divU = 0.0;
      for ( int j = 0; j < nDim; ++j)
        divU += p_duidxj[j*nDim+j];

      // lhs diffusion; only -mu*dui/dxj*Aj contribution for now
      const double dlhsfac = -viscIp*asq*inv_axdx;

      // advection lhs factors; independent of velocity component and element node
      // upwind advection (includes 4th); left node
      const double alhsfacUpwL = 0.5*(tmdot+std::abs(tmdot))*pecfac*alphaUpw
        + 0.5*alpha*om_pecfac*tmdot;
      // upwind advection (includes 4th); right node
      const double alhsfacUpwR = 0.5*(tmdot-std::abs(tmdot))*pecfac*alphaUpw
        + 0.5*alpha*om_pecfac*tmdot;
      // central; collect terms on alpha and alphaUpw
      const double alhsfacCen = 0.5*tmdot*(pecfac*om_alphaUpw + om_pecfac*om_alpha);

      // shape functions at the halo point; depend only on isoParCoords_, so
      // evaluate once per halo rather than once per velocity component
      meSCS->general_shape_fcn(1, &(infoObject->isoParCoords_[0]), &shpfc[0]);

      for ( int i = 0; i < nDim; ++i ) {

        // 2nd order central
        const double uiIp = 0.5*(uNp1R[i] + uNp1L[i]);

        // upwind
        const double uiUpwind = (tmdot > 0) ? alphaUpw*p_uIpL[i] + om_alphaUpw*uiIp
          : alphaUpw*p_uIpR[i] + om_alphaUpw*uiIp;

        // generalized central (2nd and 4th order)
        const double uiHatL = alpha*p_uIpL[i] + om_alpha*uiIp;
        const double uiHatR = alpha*p_uIpR[i] + om_alpha*uiIp;
        const double uiCds  = 0.5*(uiHatL + uiHatR);

        // total advection; pressure contribution in time term expression
        const double aflux = tmdot*(pecfac*uiUpwind + om_pecfac*uiCds);

        // diffusive flux; viscous tensor dotted with area vector
        double dflux = 2.0/3.0*viscIp*divU*p_areaVec[i]*includeDivU_;
        const int offSetI = nDim*i;
        for ( int j = 0; j < nDim; ++j ) {
          const int offSetTrans = nDim*j+i;
          const double axj = p_areaVec[j];
          dflux += -viscIp*(p_duidxj[offSetI+j] + p_duidxj[offSetTrans])*axj;
        }

        // residual for total flux
        const double tflux = aflux + dflux;
        const int indexL = i;

        // setup for LHS; row left is easy
        const int rowL = indexL * npePlusOne * nDim;
        const int rLiL = rowL+indexL;

        // total flux left
        p_rhs[indexL] -= tflux;

        // area vector component for this row; used by the extra diffusion terms
        const double axi = p_areaVec[i];

        // for ease of reading, scale left node by nodesPerElement; each pass
        // adds 1/nodesPerElement of the left-node factor, while the right
        // (halo) sensitivity is spread over the element nodes by shpfc
        for ( int ni = 0; ni < num_nodes; ++ni ) {

          const int indexR = i + nDim*(ni+1);
          const int rLiR = rowL+indexR;

          //==============================
          // advection first
          //==============================

          // upwind advection (includes 4th); left node
          p_lhs[rLiL] += alhsfacUpwL*inv_nodesPerElement;

          // upwind advection (includes 4th); right node
          p_lhs[rLiR] += alhsfacUpwR*shpfc[ni];

          // central; left; collect terms on alpha and alphaUpw
          p_lhs[rLiL] += alhsfacCen*inv_nodesPerElement;
          p_lhs[rLiR] += alhsfacCen*shpfc[ni];
          // central; right n/a

          //==============================
          // diffusion second
          //==============================

          //diffusion; row IL
          p_lhs[rLiL] -= dlhsfac*inv_nodesPerElement;
          p_lhs[rLiR] += dlhsfac*shpfc[ni];

          // diffusion; row IR; n/a

          // more diffusion; see theory manual
          for ( int j = 0; j < nDim; ++j ) {
            const double lhsfacNS = -viscIp*axi*p_areaVec[j]*inv_axdx;

            const int colL = j;
            const int colR = j + nDim*(ni+1);

            // first left; IL,IL; IL,IR
            p_lhs[rowL + colL] -= lhsfacNS*inv_nodesPerElement;
            p_lhs[rowL + colR] += lhsfacNS*shpfc[ni];

            // now right, IR,IL; IR,IR; n/a
          }
        }
      }

      // apply to linear system
      apply_coeff(connected_nodes, rhs, lhs, __FILE__);

    }
  }
}