// Per-team kernel. The team with league rank i works on the slice data(i,:,:):
//   1. for every j, count the entries data(i,j,:) that are multiples of 4 and
//      store the count in a team-shared array;
//   2. count the neighbouring pairs (j, j+1) whose counts are equal;
//   3. atomically add that number of pairs to the global value gsum().
KOKKOS_INLINE_FUNCTION
  void operator() ( const team_member & thread) const {
    const int league_idx = thread.league_rank();
    const auto n_segments = data.extent(1);

    // Team-shared array with one slot per j.
    shared_1d_int count(thread.team_shmem(), n_segments);

    // Step 1: the threads of the team share the j range.
    Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, n_segments), [=] (const int& j) {
      int multiples_of_4;
      // Vector-level reduction over the innermost dimension. The result is
      // handed back through multiples_of_4 to every vector lane.
      Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(thread, data.extent(2)), [=] (const int& k, int & lane_count) {
        if (data(league_idx, j, k) % 4 == 0) ++lane_count;
      }, multiples_of_4);

      // Only one vector lane per thread stores the result in the shared array.
      Kokkos::single(Kokkos::PerThread(thread), [=] () {
        count(j) = multiples_of_4;
      });
    });

    // All shared-memory writes of step 1 must be complete before they are read.
    thread.team_barrier();

    // Step 2: team-level reduction over the pairs (j, j+1). The result is
    // delivered through matching_pairs.
    int matching_pairs = 0;
    Kokkos::parallel_reduce(Kokkos::TeamThreadRange(thread, n_segments - 1), [=] (const int& j, int& thread_count) {
      // The contribution is not added to thread_count directly; it goes through
      // a PerThread single, which takes thread_count as its in/out value.
      Kokkos::single(Kokkos::PerThread(thread), [=] (int& pairs) {
        pairs += (count(j) == count(j + 1)) ? 1 : 0;
      }, thread_count);
    }, matching_pairs);

    // Step 3: exactly one thread and vector lane of the team updates the global sum.
    Kokkos::single(Kokkos::PerTeam(thread), [=] () {
      Kokkos::atomic_add(&gsum(), matching_pairs);
    });
  }
// Fills target_values with the values obtained from source_values through
// NearestNeighborOperatorImpl::fetch, using the communicator, ranks and
// indices stored in this operator.
void NearestNeighborOperator<DeviceType>::apply(
    Kokkos::View<double const *, DeviceType> source_values,
    Kokkos::View<double *, DeviceType> target_values ) const
{
    using Impl = Details::NearestNeighborOperatorImpl<DeviceType>;

    // Preconditions: one stored index per target entry, and a source view
    // whose length matches the size recorded in this operator.
    DTK_REQUIRE( _indices.extent( 0 ) == target_values.extent( 0 ) );
    DTK_REQUIRE( _size == source_values.extent_int( 0 ) );

    // Fetch the values and copy them straight into the output view.
    Kokkos::deep_copy( target_values, Impl::fetch( _comm, _ranks, _indices,
                                                   source_values ) );
}
    // Computes one Morton code per query from the centroid of the query's
    // geometry, using the scene bounding box to transform the centroid first.
    // NOTE(review): the returned View<size_t *> is presumably the permutation
    // that orders the queries along the Z-order curve -- confirm against the
    // rest of the function.
    static Kokkos::View<size_t *, DeviceType>
    sortQueriesAlongZOrderCurve( Box const &scene_bounding_box,
                                 Kokkos::View<Query *, DeviceType> queries )
    {
        auto const n_queries = queries.extent( 0 );

        // One code per query. The storage is left uninitialized because the
        // kernel below writes every entry in [0, n_queries).
        Kokkos::View<unsigned int *, DeviceType> morton_codes(
            Kokkos::ViewAllocateWithoutInitializing( "morton" ), n_queries );
        Kokkos::parallel_for(
            ARBORX_MARK_REGION( "assign_morton_codes_to_queries" ),
            Kokkos::RangePolicy<ExecutionSpace>( 0, n_queries ),
            KOKKOS_LAMBDA( int i ) {
                Point xyz = Details::returnCentroid( queries( i )._geometry );
                // Transform the centroid in place with respect to the scene
                // bounding box (presumably a normalization to the unit cube
                // -- TODO confirm).
                translateAndScale( xyz, xyz, scene_bounding_box );
                morton_codes( i ) = morton3D( xyz[0], xyz[1], xyz[2] );
            } );
void pointInCell( double threshold,
                  Kokkos::View<Coordinate **, DeviceType> physical_points,
                  Kokkos::View<Coordinate ***, DeviceType> cells,
                  Kokkos::View<int *, DeviceType> coarse_search_output_cells,
                  Kokkos::View<Coordinate **, DeviceType> reference_points,
                  Kokkos::View<bool *, DeviceType> point_in_cell )
{
    using ExecutionSpace = typename DeviceType::execution_space;
    int const n_ref_pts = reference_points.extent( 0 );

    Functor::PointInCell<CellType, DeviceType> search_functor(
        threshold, physical_points, cells, coarse_search_output_cells,
        reference_points, point_in_cell );
    Kokkos::parallel_for( DTK_MARK_REGION( "point_in_cell" ),
                          Kokkos::RangePolicy<ExecutionSpace>( 0, n_ref_pts ),
                          search_functor );
}
// Sparse matrix-vector product y = 1.0 * A * x + 0.0 * y using the "inspector"
// functor: the rows of A are partitioned into worksets holding roughly the
// same number of nonzeros, and one team is launched per workset.
//
// The workset count and the workset offsets live in function-local statics.
// They are computed on the first call only and reused by every later call,
// regardless of the arguments passed then.
//
// rows_per_thread is only referenced by the commented-out alternative below.
void kk_inspector_matvec(AType A, XType x, YType y, int rows_per_thread, int team_size, int vector_length) {

  using Scalar          = typename XType::non_const_value_type;
  using execution_space = typename AType::execution_space;
  using matrix_type     = KokkosSparse::CrsMatrix<const Scalar,int,execution_space,void,int>;
  using y_type          = Kokkos::View<Scalar*,Kokkos::LayoutLeft,execution_space>;
  using x_type          = Kokkos::View<const Scalar*,Kokkos::LayoutLeft,execution_space,Kokkos::MemoryRandomAccess >;
  using policy_type     = Kokkos::TeamPolicy<Kokkos::Schedule<Schedule> >;

  //int rows_per_team = launch_parameters<execution_space>(A.numRows(),A.nnz(),rows_per_thread,team_size,vector_length);
  //static int worksets = (y.extent(0)+rows_per_team-1)/rows_per_team;

  // Static scheduling: concurrency (divided by the team size when one is given).
  // Dynamic scheduling: 32 times as many worksets.
  constexpr int workset_multiplier = std::is_same<Schedule,Kokkos::Static>::value ? 1 : 32;
  static int worksets = team_size>0 ? execution_space::concurrency()*workset_multiplier/team_size
                                    : execution_space::concurrency()*workset_multiplier;

  static Kokkos::View<int*> workset_offsets;
  if(workset_offsets.extent(0) == 0) {
    // First call: build the row offsets that delimit the worksets.
    // NOTE(review): the view is filled with plain element access from this
    // function, which assumes its memory space is host accessible -- confirm.
    workset_offsets = Kokkos::View<int*> ("WorksetOffsets",worksets+1);
    const size_t nnz = A.nnz();
    int nnz_per_workset = (nnz+worksets-1)/worksets;
    const auto num_rows = A.numRows();

    workset_offsets(0) = 0;
    int n_offsets = 1; // offsets written so far
    for(int row = 0; row < num_rows; ++row) {
      // Start a new workset once the running nonzero count passes the next
      // multiple of nnz_per_workset.
      if(A.graph.row_map(row) > n_offsets*nnz_per_workset) {
        workset_offsets(n_offsets++) = row;
      }
    }
    // Close the last workset at the end of the matrix.
    if(workset_offsets(n_offsets-1) < num_rows) {
      workset_offsets(n_offsets) = num_rows;
    }
    printf("Worksets: %i %i\n",worksets,n_offsets);
    // From now on only the worksets actually created are launched.
    worksets = n_offsets;
  }

  double alpha = 1.0;
  double beta  = 0.0;
  SPMV_Inspector_Functor<matrix_type,x_type,y_type,0,false,int> func (alpha,A,x,workset_offsets,beta,y);

  // One team per workset; Kokkos picks the team size unless one was requested.
  policy_type policy(1,1);
  if(team_size <= 0)
    policy = policy_type(worksets,Kokkos::AUTO,vector_length);
  else
    policy = policy_type(worksets,team_size,vector_length);

  Kokkos::parallel_for("KokkosSparse::PerfTest::SpMV_Inspector", policy,func);
}
// Dispatches the point-in-cell search to internal::pointInCell instantiated
// for the cell type matching cell_topo, then fences. Throws
// DataTransferKitNotImplementedException for any other topology.
void PointInCell<DeviceType>::search(
    Kokkos::View<Coordinate **, DeviceType> physical_points,
    Kokkos::View<Coordinate ***, DeviceType> cells,
    Kokkos::View<int *, DeviceType> coarse_search_output_cells,
    DTK_CellTopology cell_topo,
    Kokkos::View<Coordinate **, DeviceType> reference_points,
    Kokkos::View<bool *, DeviceType> point_in_cell )
{
    // The views must agree on the number of points and on the dimension.
    DTK_REQUIRE( reference_points.extent( 0 ) == point_in_cell.extent( 0 ) );
    DTK_REQUIRE( reference_points.extent( 0 ) == physical_points.extent( 0 ) );
    DTK_REQUIRE( reference_points.extent( 1 ) == physical_points.extent( 1 ) );
    DTK_REQUIRE( reference_points.extent( 1 ) == cells.extent( 2 ) );

// Expands to the switch case for DTK_<TOPOLOGY>: run the search with the
// CellType <TOPOLOGY>. The CellType template hides the template parameters
// used by Intrepid2.
#define DTK_POINT_IN_CELL_CASE( TOPOLOGY )                                     \
    case DTK_##TOPOLOGY:                                                       \
        internal::pointInCell<TOPOLOGY, DeviceType>(                           \
            threshold, physical_points, cells, coarse_search_output_cells,     \
            reference_points, point_in_cell );                                 \
        break

    // Note that if the Newton solver does not converge, Intrepid2 will just
    // return the last results and there is no way to know that the
    // coordinates in the reference frame were not found.
    switch ( cell_topo )
    {
        DTK_POINT_IN_CELL_CASE( HEX_8 );
        DTK_POINT_IN_CELL_CASE( HEX_27 );
        DTK_POINT_IN_CELL_CASE( PYRAMID_5 );
        DTK_POINT_IN_CELL_CASE( QUAD_4 );
        DTK_POINT_IN_CELL_CASE( QUAD_9 );
        DTK_POINT_IN_CELL_CASE( TET_4 );
        DTK_POINT_IN_CELL_CASE( TET_10 );
        DTK_POINT_IN_CELL_CASE( TRI_3 );
        DTK_POINT_IN_CELL_CASE( TRI_6 );
        DTK_POINT_IN_CELL_CASE( WEDGE_6 );
        DTK_POINT_IN_CELL_CASE( WEDGE_18 );
    default:
        throw DataTransferKitNotImplementedException();
    }

#undef DTK_POINT_IN_CELL_CASE

    Kokkos::fence();
}
 // Shared memory requested by the functor for a team of the given size:
 // room for one shared_1d_int with data.extent(1) entries. The amount does
 // not depend on team_size.
 size_t team_shmem_size( int team_size ) const {
   const auto n_entries = data.extent(1);
   return shared_1d_int::shmem_size(n_entries);
 }
// Checks buildStructuredMesh() in 2D and 3D against reference meshes read
// from text files. Before the comparison the reference coordinates are
// shifted along the last axis by (subdivisions along that axis) * (MPI rank).
TEUCHOS_UNIT_TEST_TEMPLATE_1_DECL( MeshGenerator, structured, DeviceType )
{
    MPI_Comm comm = MPI_COMM_WORLD;
    int comm_rank;
    MPI_Comm_rank( comm, &comm_rank );

    Kokkos::View<DTK_CellTopology *, DeviceType> cell_topologies_view;
    Kokkos::View<unsigned int *, DeviceType> cells;
    Kokkos::View<double **, DeviceType> coordinates;

    std::string filename;
    std::vector<std::vector<DataTransferKit::Coordinate>> coordinates_ref;
    std::vector<unsigned int> cells_ref;
    std::vector<unsigned int> n_subdivisions;

    // Move the reference mesh according to the rank, along the given axis.
    auto shift_reference_mesh = [&]( unsigned int axis ) {
        double offset = n_subdivisions[axis] * comm_rank;
        for ( auto &vertex : coordinates_ref )
            vertex[axis] += offset;
    };

    // Expected number of cells: product of the subdivisions per direction.
    auto expected_cell_count = [&]() {
        unsigned int product = 1;
        for ( auto n_sub : n_subdivisions )
            product *= n_sub;
        return product;
    };

    // 2D test
    {
        filename = "structured_2d.txt";
        n_subdivisions = {4, 3};
        std::tie( coordinates_ref, cells_ref ) = readInputFile( filename );
        shift_reference_mesh( 1 );

        std::tie( cell_topologies_view, cells, coordinates ) =
            buildStructuredMesh<DeviceType>( comm, n_subdivisions );

        // Check view size
        unsigned int n_vertices = coordinates_ref.size();
        unsigned int n_cells = expected_cell_count();
        TEST_EQUALITY( cell_topologies_view.extent( 0 ), n_cells );
        TEST_EQUALITY( cells.extent( 0 ), cells_ref.size() );
        TEST_EQUALITY( coordinates.extent( 0 ), n_vertices );

        // Check topology: every cell is a 4-node quadrilateral
        auto cell_topologies_view_host =
            Kokkos::create_mirror_view( cell_topologies_view );
        Kokkos::deep_copy( cell_topologies_view_host, cell_topologies_view );
        for ( unsigned int i = 0; i < n_cells; ++i )
            TEST_EQUALITY( cell_topologies_view_host( i ), DTK_QUAD_4 );

        // Check cells
        auto cells_host = Kokkos::create_mirror_view( cells );
        Kokkos::deep_copy( cells_host, cells );
        TEST_COMPARE_ARRAYS( cells_host, cells_ref );

        // Check coordinates
        unsigned int dim = 2;
        auto coordinates_host = Kokkos::create_mirror_view( coordinates );
        Kokkos::deep_copy( coordinates_host, coordinates );
        for ( unsigned int i = 0; i < n_vertices; ++i )
            for ( unsigned int j = 0; j < dim; ++j )
                TEST_EQUALITY( coordinates_host( i, j ),
                               coordinates_ref[i][j] );
    }

    // 3D test
    {
        filename = "structured_3d.txt";
        n_subdivisions = {2, 3, 4};
        std::tie( coordinates_ref, cells_ref ) = readInputFile( filename );
        shift_reference_mesh( 2 );

        std::tie( cell_topologies_view, cells, coordinates ) =
            buildStructuredMesh<DeviceType>( comm, n_subdivisions );

        // Check view size
        unsigned int n_vertices = coordinates_ref.size();
        unsigned int n_cells = expected_cell_count();
        TEST_EQUALITY( cell_topologies_view.extent( 0 ), n_cells );
        TEST_EQUALITY( cells.extent( 0 ), cells_ref.size() );
        TEST_EQUALITY( coordinates.extent( 0 ), n_vertices );

        // Check topology: every cell is an 8-node hexahedron
        auto cell_topologies_view_host =
            Kokkos::create_mirror_view( cell_topologies_view );
        Kokkos::deep_copy( cell_topologies_view_host, cell_topologies_view );
        for ( unsigned int i = 0; i < n_cells; ++i )
            TEST_EQUALITY( cell_topologies_view_host( i ), DTK_HEX_8 );

        // Check cells
        auto cells_host = Kokkos::create_mirror_view( cells );
        Kokkos::deep_copy( cells_host, cells );
        TEST_COMPARE_ARRAYS( cells_host, cells_ref );

        // Check coordinates
        unsigned int dim = 3;
        auto coordinates_host = Kokkos::create_mirror_view( coordinates );
        Kokkos::deep_copy( coordinates_host, coordinates );
        for ( unsigned int i = 0; i < n_vertices; ++i )
            for ( unsigned int j = 0; j < dim; ++j )
                TEST_EQUALITY( coordinates_host( i, j ),
                               coordinates_ref[i][j] );
    }
}
// Checks buildSimplexMesh() in 2D (triangles) and 3D (tetrahedra). The
// reference cells are generated here from small tables of vertex offsets and
// compared, vertex by vertex, with the coordinates the mesh generator
// returns. The last coordinate of every reference vertex is shifted by
// (MPI rank) * (subdivisions along the last axis).
TEUCHOS_UNIT_TEST_TEMPLATE_1_DECL( MeshGenerator, simplex, DeviceType )
{
    MPI_Comm comm = MPI_COMM_WORLD;
    int comm_rank;
    MPI_Comm_rank( comm, &comm_rank );

    Kokkos::View<DTK_CellTopology *, DeviceType> cell_topologies_view;
    Kokkos::View<unsigned int *, DeviceType> cells;
    Kokkos::View<double **, DeviceType> coordinates;

    // 2D test
    {
        unsigned int constexpr dim_2 = 2;
        unsigned int constexpr n_vertices_per_tri = 3;
        unsigned int constexpr n_tri_per_quad = 2;

        // Vertex offsets {dx, dy} of the two triangles generated for the
        // quadrilateral whose lower-left corner is (x, y).
        unsigned int const
            tri_corners[n_tri_per_quad][n_vertices_per_tri][dim_2] = {
                {{0, 0}, {1, 0}, {0, 1}}, {{1, 0}, {1, 1}, {0, 1}}};

        std::vector<unsigned int> n_subdivisions = {4, 3};
        unsigned int n_cells = n_tri_per_quad;
        for ( auto n_sub : n_subdivisions )
            n_cells *= n_sub;
        double const offset = comm_rank * n_subdivisions[1];

        // Build the reference triangles, row by row.
        std::vector<std::array<std::array<double, dim_2>, n_vertices_per_tri>>
            tri_mesh_ref( n_cells );
        unsigned int n = 0;
        for ( unsigned int y = 0; y < n_subdivisions[1]; ++y )
            for ( unsigned int x = 0; x < n_subdivisions[0]; ++x )
                for ( auto const &tri : tri_corners )
                {
                    for ( unsigned int v = 0; v < n_vertices_per_tri; ++v )
                    {
                        tri_mesh_ref[n][v][0] = x + tri[v][0];
                        tri_mesh_ref[n][v][1] = y + tri[v][1] + offset;
                    }
                    ++n;
                }

        std::tie( cell_topologies_view, cells, coordinates ) =
            buildSimplexMesh<DeviceType>( comm, n_subdivisions );
        TEST_EQUALITY( cell_topologies_view.extent( 0 ), n_cells );
        TEST_EQUALITY( cells.extent( 0 ), n_cells * n_vertices_per_tri );

        // Every cell must be a 3-node triangle.
        auto cell_topologies_view_host =
            Kokkos::create_mirror_view( cell_topologies_view );
        Kokkos::deep_copy( cell_topologies_view_host, cell_topologies_view );
        for ( unsigned int i = 0; i < n_cells; ++i )
            TEST_EQUALITY( cell_topologies_view_host( i ), DTK_TRI_3 );

        // Follow the connectivity and compare each vertex with the reference.
        auto cells_host = Kokkos::create_mirror_view( cells );
        Kokkos::deep_copy( cells_host, cells );
        auto coordinates_host = Kokkos::create_mirror_view( coordinates );
        Kokkos::deep_copy( coordinates_host, coordinates );
        n = 0;
        for ( unsigned int i = 0; i < n_cells; ++i )
            for ( unsigned int j = 0; j < n_vertices_per_tri; ++j, ++n )
            {
                unsigned int coord_pos = cells_host( n );
                for ( unsigned int k = 0; k < dim_2; ++k )
                    TEST_EQUALITY( coordinates_host( coord_pos, k ),
                                   tri_mesh_ref[i][j][k] );
            }
    }

    // 3D test
    {
        unsigned int constexpr dim_3 = 3;
        unsigned int constexpr n_vertices_per_tet = 4;
        unsigned int constexpr n_tet_per_hex = 5;
        unsigned int constexpr n_tet_per_pattern = 10;

        // Vertex offsets {dx, dy, dz} of the ten tetrahedra generated for a
        // stack of two hexahedra (z in [z0, z0 + 2]) whose lowest corner is
        // (x, y, z0).
        unsigned int const
            tet_corners[n_tet_per_pattern][n_vertices_per_tet][dim_3] = {
                {{0, 0, 0}, {1, 0, 0}, {0, 1, 0}, {0, 0, 1}},  // first
                {{1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {1, 1, 1}},  // second
                {{1, 0, 0}, {1, 1, 1}, {0, 1, 0}, {0, 0, 1}},  // third
                {{0, 1, 0}, {1, 0, 1}, {1, 1, 1}, {0, 1, 1}},  // fourth
                {{1, 0, 0}, {1, 1, 1}, {0, 0, 1}, {1, 0, 1}},  // fifth
                {{1, 0, 1}, {1, 1, 1}, {0, 0, 1}, {1, 0, 2}},  // sixth
                {{1, 1, 1}, {0, 1, 1}, {0, 0, 1}, {0, 1, 2}},  // seventh
                {{0, 0, 1}, {1, 1, 1}, {1, 0, 2}, {0, 1, 2}},  // eighth
                {{0, 0, 2}, {0, 1, 2}, {1, 0, 2}, {0, 0, 1}},  // ninth
                {{0, 1, 2}, {1, 1, 2}, {1, 0, 2}, {1, 1, 1}}}; // tenth

        std::vector<unsigned int> n_subdivisions = {2, 2, 2};
        unsigned int n_cells = n_tet_per_hex;
        for ( auto n_sub : n_subdivisions )
            n_cells *= n_sub;
        double const offset = comm_rank * n_subdivisions[2];

        // Build the reference tetrahedra; z advances two layers at a time
        // because one pattern covers two hexahedra in z.
        std::vector<std::array<std::array<double, dim_3>, n_vertices_per_tet>>
            tet_mesh_ref( n_cells );
        unsigned int n = 0;
        for ( unsigned int z = 0; z < n_subdivisions[2]; z += 2 )
            for ( unsigned int y = 0; y < n_subdivisions[1]; ++y )
                for ( unsigned int x = 0; x < n_subdivisions[0]; ++x )
                    for ( auto const &tet : tet_corners )
                    {
                        for ( unsigned int v = 0; v < n_vertices_per_tet; ++v )
                        {
                            tet_mesh_ref[n][v][0] = x + tet[v][0];
                            tet_mesh_ref[n][v][1] = y + tet[v][1];
                            tet_mesh_ref[n][v][2] = z + tet[v][2] + offset;
                        }
                        ++n;
                    }

        std::tie( cell_topologies_view, cells, coordinates ) =
            buildSimplexMesh<DeviceType>( comm, n_subdivisions );
        TEST_EQUALITY( cell_topologies_view.extent( 0 ), n_cells );
        TEST_EQUALITY( cells.extent( 0 ), n_cells * n_vertices_per_tet );

        // Every cell must be a 4-node tetrahedron.
        auto cell_topologies_view_host =
            Kokkos::create_mirror_view( cell_topologies_view );
        Kokkos::deep_copy( cell_topologies_view_host, cell_topologies_view );
        for ( unsigned int i = 0; i < n_cells; ++i )
            TEST_EQUALITY( cell_topologies_view_host( i ), DTK_TET_4 );

        // Follow the connectivity and compare each vertex with the reference.
        auto cells_host = Kokkos::create_mirror_view( cells );
        Kokkos::deep_copy( cells_host, cells );
        auto coordinates_host = Kokkos::create_mirror_view( coordinates );
        Kokkos::deep_copy( coordinates_host, coordinates );
        n = 0;
        for ( unsigned int i = 0; i < n_cells; ++i )
            for ( unsigned int j = 0; j < tet_mesh_ref[i].size(); ++j, ++n )
            {
                unsigned int coord_pos = cells_host( n );
                for ( unsigned int k = 0; k < dim_3; ++k )
                    TEST_EQUALITY( coordinates_host( coord_pos, k ),
                                   tet_mesh_ref[i][j][k] );
            }
    }
}
// Checks buildMixedMesh() in 2D (quadrilaterals and triangles) and 3D
// (hexahedra and tetrahedra) against reference meshes read from text files.
// Before the comparison the reference coordinates are shifted along the
// first axis by 3 * (MPI rank).
TEUCHOS_UNIT_TEST_TEMPLATE_1_DECL( MeshGenerator, mixed, DeviceType )
{
    MPI_Comm comm = MPI_COMM_WORLD;

    Kokkos::View<DTK_CellTopology *, DeviceType> cell_topologies_view;
    Kokkos::View<unsigned int *, DeviceType> cells;
    Kokkos::View<double **, DeviceType> coordinates;

    // 2D test
    std::string filename = "mixed_2d.txt";
    std::vector<std::vector<DataTransferKit::Coordinate>> coordinates_ref;
    std::vector<unsigned int> cells_ref;
    std::tie( coordinates_ref, cells_ref ) = readInputFile( filename );
    unsigned int dim = 2;
    // Move mesh according to the rank
    int comm_rank;
    MPI_Comm_rank( comm, &comm_rank );
    double offset = 3. * comm_rank;
    for ( auto &coord : coordinates_ref )
        coord[0] += offset;

    std::tie( cell_topologies_view, cells, coordinates ) =
        buildMixedMesh<DeviceType>( comm, dim );

    // Check view size
    unsigned int n_vertices = coordinates_ref.size();
    unsigned int n_cells = 6;
    TEST_EQUALITY( cell_topologies_view.extent( 0 ), n_cells );
    TEST_EQUALITY( cells.extent( 0 ), cells_ref.size() );
    TEST_EQUALITY( coordinates.extent( 0 ), n_vertices );

    // Check topology
    auto cell_topologies_view_host =
        Kokkos::create_mirror_view( cell_topologies_view );
    Kokkos::deep_copy( cell_topologies_view_host, cell_topologies_view );
    std::vector<DTK_CellTopology> cell_topology_ref = {
        {DTK_QUAD_4, DTK_TRI_3, DTK_QUAD_4, DTK_QUAD_4, DTK_TRI_3, DTK_QUAD_4}};
    for ( unsigned int i = 0; i < n_cells; ++i )
        TEST_EQUALITY( cell_topologies_view_host( i ), cell_topology_ref[i] );

    // Check cells
    auto cells_host = Kokkos::create_mirror_view( cells );
    Kokkos::deep_copy( cells_host, cells );
    TEST_COMPARE_ARRAYS( cells_host, cells_ref );

    // Check coordinates
    auto coordinates_host = Kokkos::create_mirror_view( coordinates );
    Kokkos::deep_copy( coordinates_host, coordinates );
    for ( unsigned int i = 0; i < n_vertices; ++i )
        for ( unsigned int j = 0; j < dim; ++j )
            TEST_EQUALITY( coordinates_host( i, j ), coordinates_ref[i][j] );

    // 3D test
    filename = "mixed_3d.txt";
    std::tie( coordinates_ref, cells_ref ) = readInputFile( filename );
    dim = 3;
    // Move mesh according to the rank (same offset as in 2D)
    for ( auto &coord : coordinates_ref )
        coord[0] += offset;

    std::tie( cell_topologies_view, cells, coordinates ) =
        buildMixedMesh<DeviceType>( comm, dim );

    // Check view size
    n_vertices = coordinates_ref.size();
    TEST_EQUALITY( cell_topologies_view.extent( 0 ), n_cells );
    TEST_EQUALITY( cells.extent( 0 ), cells_ref.size() );
    TEST_EQUALITY( coordinates.extent( 0 ), n_vertices );

    // Check topology
    // The device view now refers to the 3D mesh, so the host mirror is
    // recreated from it (as is done for cells_host and coordinates_host
    // below) instead of reusing the mirror that was built for the 2D mesh.
    cell_topologies_view_host =
        Kokkos::create_mirror_view( cell_topologies_view );
    Kokkos::deep_copy( cell_topologies_view_host, cell_topologies_view );
    cell_topology_ref = {
        {DTK_HEX_8, DTK_TET_4, DTK_HEX_8, DTK_HEX_8, DTK_TET_4, DTK_HEX_8}};
    for ( unsigned int i = 0; i < n_cells; ++i )
        TEST_EQUALITY( cell_topologies_view_host( i ), cell_topology_ref[i] );

    // Check cells
    cells_host = Kokkos::create_mirror_view( cells );
    Kokkos::deep_copy( cells_host, cells );
    TEST_COMPARE_ARRAYS( cells_host, cells_ref );

    // Check coordinates
    coordinates_host = Kokkos::create_mirror_view( coordinates );
    Kokkos::deep_copy( coordinates_host, coordinates );
    for ( unsigned int i = 0; i < n_vertices; ++i )
        for ( unsigned int j = 0; j < dim; ++j )
            TEST_EQUALITY( coordinates_host( i, j ), coordinates_ref[i][j] );
}