// Team-level kernel: each league member (team) updates one square tile of
// _u at time slot _tp1, reading time slot _t through a team scratch buffer.
//
// The scratch buffer is (_tileSize + 2) x (_tileSize + 2): the cells this
// team writes are addressed at offsets 1.._tileSize, and each of them reads
// its four direct neighbours at +/-1, so one extra row/column is staged on
// every side.
//
// NOTE(review): the per-thread indexing below maps team_rank() onto a
// _tileSize x _tileSize grid, so this presumably expects a team size of
// _tileSize * _tileSize - confirm against the TeamPolicy at the launch site.
// NOTE(review): the update has the shape of a leapfrog step for the 2-D wave
// equation, with _u(_tp1, ...) holding the older time level on entry -
// confirm against the caller's buffer rotation.
KOKKOS_INLINE_FUNCTION
  void
  operator()(const member_type & teamMember) const {

    // Side length of the staged region (tile plus one-cell border).
    const unsigned int stagedSide = _tileSize + 2;

    // Unmanaged 2-D view over the team's scratch allocation.
    Kokkos::View< double**
                , Kokkos::LayoutRight
                , execution_space::scratch_memory_space
                , Kokkos::MemoryUnmanaged
                > staged( teamMember.team_shmem(),
                          stagedSide,
                          stagedSide);

    // Which tile this team owns, in (row, col) tile coordinates.
    const unsigned int leagueRank = teamMember.league_rank();
    const unsigned int tileRow = leagueRank / _numberOfTilesPerSide;
    const unsigned int tileCol = leagueRank % _numberOfTilesPerSide;

    // Global coordinates of staged(0, 0).
    const unsigned int originRow = tileRow * _tileSize;
    const unsigned int originCol = tileCol * _tileSize;

    // Cooperative copy of the staged region from time slot _t; the flat
    // index is unpacked row-major into (r, c).
    Kokkos::parallel_for
      (Kokkos::TeamThreadRange(teamMember, stagedSide * stagedSide),
       [=] (const unsigned int flat) {
        const unsigned int r = flat / stagedSide;
        const unsigned int c = flat % stagedSide;
        staged(r, c) = _u(_t, originRow + r, originCol + c);
      });
    // Every thread must see the full staged region before reading neighbours.
    teamMember.team_barrier();

    // Position of this thread's cell inside the staged region (skipping the
    // border row/column at index 0) ...
    const unsigned int rowInStaged = teamMember.team_rank() / _tileSize + 1;
    const unsigned int colInStaged = teamMember.team_rank() % _tileSize + 1;
    // ... and the matching position in the global array.
    const unsigned int globalRow = originRow + rowInStaged;
    const unsigned int globalCol = originCol + colInStaged;

    // Five-point update: centre value, four direct neighbours, and the value
    // currently stored in the _tp1 slot.
    const double centre = staged(rowInStaged, colInStaged);
    const double neighbourSum = staged(rowInStaged+1, colInStaged)
                              + staged(rowInStaged-1, colInStaged)
                              + staged(rowInStaged, colInStaged+1)
                              + staged(rowInStaged, colInStaged-1);
    _u(_tp1, globalRow, globalCol) =
      (2 - 4 * _courant2) * centre
      - _u(_tp1, globalRow, globalCol)
      + _courant2 * neighbourSum;
  }
 // Task team interface.
 //
 // Runs BIG rounds of a team-parallel loop over SMALL entries. In each round
 // every entry i adds (0 + 1 + ... + (TINY-1)) + 1 to _dummy[i]. A team
 // barrier follows each round only when _use_barrier is set.
 //
 // member: team handle used for the nested parallel loop and the barrier.
 // r_val:  not read or written by this method.
 void apply(const member_type &member, value_type &r_val) {
   // Per-entry work, defined once and reused for every round. The closure is
   // only invoked inside this call, so capturing by reference is safe.
   const auto accumulate = [&](const long i) {
     double partial = 0.0;
     for (long k = 0; k < TINY; ++k) {
       partial += k;
     }
     _dummy[i] += (partial + 1);
   };

   for (int round = 0; round < BIG; ++round) {
     Kokkos::parallel_for(Kokkos::TeamThreadRange(member, SMALL), accumulate);
     if (_use_barrier) {
       member.team_barrier();
     }
   }
 }
    // Task team interface.
    //
    // Works on this task's slice of _a and _b, which starts at element
    // _itask * SMALL. Runs BIG rounds of a team-parallel loop over SMALL
    // entries; each round stores b[i] + (0 + 1 + ... + (TINY-1)) + 1 into
    // a[i], then synchronises the team.
    //
    // member: team handle used for the nested parallel loop and the barrier.
    // r_val:  not read or written by this method.
    KOKKOS_INLINE_FUNCTION
    void apply(const member_type &member, value_type &r_val) {
      // Start of the slice owned by this task.
      const int sliceBegin = _itask*SMALL;
      auto dst = &_a[sliceBegin];
      auto src = &_b[sliceBegin];

      for (int round = 0; round < BIG; ++round) {
        Kokkos::parallel_for(Kokkos::TeamThreadRange(member, SMALL),
                             [&](const int idx) {
                               ValueType partial = 0.0;
                               for (int k = 0; k < TINY; ++k) {
                                 partial += k;
                               }
                               dst[idx] = src[idx] + (partial + 1);
                             });
        // Barrier is unconditional here (kept for testing).
        member.team_barrier();
      }
    }