KOKKOS_INLINE_FUNCTION
  void operator()( size_type inode ) const
  {
    //  Impose the Dirichlet boundary conditions on matrix row 'inode'.
    //
    //  A node is treated as a boundary node when its coordinate with
    //  index 2 is at or below 'bc_lower_z', or at or above 'bc_upper_z'.
    //
    //  * Boundary row: the right-hand side takes the prescribed value
    //    and the row is replaced by a one on the diagonal, zeros elsewhere.
    //  * Any other row: each entry whose column is a boundary node is
    //    folded into the right-hand side and then cleared, which is the
    //    column half of the elimination (intended to keep the global
    //    matrix symmetric).

    const size_type row_begin = matrix.graph.row_map[inode];
    const size_type row_end   = matrix.graph.row_map[inode+1];

    const ScalarCoordType node_z = node_coords(inode,2);
    const bool on_lower = node_z <= bc_lower_z ;
    const bool on_upper = bc_upper_z <= node_z ;

    if ( on_lower || on_upper ) {
      //  The lower condition wins when a node satisfies both tests.
      const ScalarType prescribed = on_lower ? bc_lower_value
                                             : bc_upper_value ;

      rhs(inode) = prescribed ;

      //  Row index converted once, to the type used for the diagonal test.
      const int diag = static_cast<int>( inode );

      for( size_type k = row_begin ; k < row_end ; ++k ) {
        if ( diag == matrix.graph.entries(k) ) {
          matrix.coefficients(k) = 1 ;
        }
        else {
          matrix.coefficients(k) = 0 ;
        }
      }
      return ;
    }

    //  Non-boundary row: eliminate the boundary columns.
    for( size_type k = row_begin ; k < row_end ; ++k ) {
      const size_type col = matrix.graph.entries(k) ;

      const ScalarCoordType col_z = node_coords(col,2);
      const bool col_on_lower = col_z <= bc_lower_z ;
      const bool col_on_upper = bc_upper_z <= col_z ;

      if ( ! ( col_on_lower || col_on_upper ) ) { continue ; }

      const ScalarType col_prescribed = col_on_lower ? bc_lower_value
                                                     : bc_upper_value ;

      //  Move the known contribution to the load vector before
      //  clearing the coefficient.
      rhs( inode ) -= col_prescribed * matrix.coefficients(k);

      matrix.coefficients(k) = 0 ;
    }
  }
  inline
  void operator()( const size_type iRow ) const
  {
    //  Computes one entry of the product y = A * x:
    //  m_y(iRow) receives the sum, over the stored entries of row
    //  'iRow', of coefficient times the m_x value at that entry's column.
    //  The row's entries occupy the half-open index range
    //  [ row_map[iRow], row_map[iRow+1] ).

    const size_type rowFirst = m_A.graph.row_map[iRow];
    const size_type rowLast  = m_A.graph.row_map[iRow+1];

    //  Accumulated in double regardless of the stored value types.
    double rowDot = 0 ;

    //  The vectorization hints are emitted for the Intel compiler only;
    //  every other compiler sees the plain loop.
#if defined( __INTEL_COMPILER )
#pragma simd reduction(+:rowDot)
#pragma ivdep
#endif
    for ( size_type k = rowFirst ; k < rowLast ; ++k ) {
      rowDot += m_A.coefficients(k) * m_x( m_A.graph.entries(k) );
    }

    m_y(iRow) = rowDot ;
  }