/// Updates one tile of `_u` for the team identified by `teamMember`.
///
/// The tile is selected by the league rank (row-major over
/// `_numberOfTilesPerSide` tiles per side). The team first copies the tile
/// plus a one-cell border on every side from slot `_t` of `_u` into a team
/// scratch view, synchronizes, and then writes the new value of every cell
/// of the tile into slot `_tp1` of `_u`.
///
/// Both phases are distributed over the team with TeamThreadRange, so the
/// result does not depend on the team size being exactly
/// `_tileSize * _tileSize`: smaller teams loop over several cells per thread
/// and larger teams leave the surplus threads idle instead of indexing past
/// the tile.
///
/// NOTE(review): the launch must provide team scratch for at least
/// (_tileSize + 2)^2 doubles; where that is requested is not visible here.
/// NOTE(review): `_u(_tp1, i, j)` is read before it is overwritten, so that
/// slot presumably still holds the previous time step on entry - confirm
/// against the caller.
///
/// @param teamMember  team handle passed in by the team policy.
KOKKOS_INLINE_FUNCTION
  void
  operator()(const member_type & teamMember) const {

    const unsigned int tileSize = _tileSize;
    // side length of the scratch tile: the tile plus one border cell per side
    const unsigned int paddedSide = tileSize + 2;
    const unsigned int cellsPerTile = tileSize * tileSize;

    Kokkos::View< double**
                , Kokkos::LayoutRight
                , execution_space::scratch_memory_space
                , Kokkos::MemoryUnmanaged
                > shared( teamMember.team_shmem(),
                          paddedSide,
                          paddedSide);

    // locate this team's tile in the tile grid
    const unsigned int tileIndex = teamMember.league_rank();
    const unsigned int tileRow = tileIndex / _numberOfTilesPerSide;
    const unsigned int tileCol = tileIndex % _numberOfTilesPerSide;

    // position in _u that maps to shared(0, 0)
    const unsigned int tileRowOrigin = tileRow * tileSize;
    const unsigned int tileColOrigin = tileCol * tileSize;

    // cooperatively load the padded tile from slot _t into scratch
    Kokkos::parallel_for
      (Kokkos::TeamThreadRange(teamMember, paddedSide * paddedSide),
       [=] (const unsigned int index) {
        const unsigned int i = index / paddedSide;
        const unsigned int j = index % paddedSide;
        shared(i, j) = _u(_t, tileRowOrigin + i, tileColOrigin + j);
      });
    // every scratch entry must be written before any thread reads neighbors
    teamMember.team_barrier();

    // update every cell of the tile; each cell is handled exactly once,
    // whatever the team size is
    Kokkos::parallel_for
      (Kokkos::TeamThreadRange(teamMember, cellsPerTile),
       [=] (const unsigned int cell) {
        // indices into shared, shifted by one to skip the border
        const unsigned int iShared = cell / tileSize + 1;
        const unsigned int jShared = cell % tileSize + 1;
        // matching indices into _u
        const unsigned int i = tileRowOrigin + iShared;
        const unsigned int j = tileColOrigin + jShared;

        const double utij = shared(iShared, jShared);
        _u(_tp1, i, j) =
          (2 - 4 * _courant2) * utij
          - _u(_tp1, i, j)
          + _courant2 * (shared(iShared+1, jShared)
                         + shared(iShared-1, jShared)
                         + shared(iShared, jShared+1)
                         + shared(iShared, jShared-1));
      });
  }