TEST_F( KokkosThreads, SerialInitialize)
{
  // Allocate a rank-2 array that is RUN_TIME_DIMENSION x COMPILE_TIME_DIMENSION.
  // View default-initializes all the values unless it is explicitly disabled, i.e.,
  //   Kokkos::View<unsigned*[COMPILE_TIME_DIMENSION], KOKKOS_THREAD_DEVICE> a("node views", RUN_TIME_DIMENSION);
  // zero-fills the array, but
  //   Kokkos::View<unsigned*[COMPILE_TIME_DIMENSION], KOKKOS_THREAD_DEVICE> a( Kokkos::ViewAllocateWithoutInitializing("node views"), RUN_TIME_DIMENSION);
  // will allocate without initializing the array.
  Kokkos::View<unsigned*[COMPILE_TIME_DIMENSION], KOKKOS_THREAD_DEVICE> a(
      Kokkos::ViewAllocateWithoutInitializing("node views"), RUN_TIME_DIMENSION);

  for (size_t i = 0; i < a.dimension_0(); ++i) {
    for (size_t x = 0; x < a.dimension_1(); ++x) {
      a(i,x) = i;
    }
  }

  // Get a const view to the same array.
  // This view shares the same memory as a, but cannot modify the values.
  Kokkos::View<const unsigned*[COMPILE_TIME_DIMENSION], KOKKOS_THREAD_DEVICE> b = a;

  for (size_t i = 0; i < b.dimension_0(); ++i) {
    for (size_t x = 0; x < b.dimension_1(); ++x) {
      EXPECT_EQ(i, b(i,x));
    }
  }
}
TEST_F( KokkosThreads, LambdaInitialize)
{
  Kokkos::View<unsigned*[COMPILE_TIME_DIMENSION], KOKKOS_THREAD_DEVICE> a(
      Kokkos::ViewAllocateWithoutInitializing("node views"), RUN_TIME_DIMENSION);

  Kokkos::parallel_for<KOKKOS_THREAD_DEVICE>(
      a.dimension_0()
    , [=](size_t i) {
        for (size_t x = 0; x < a.dimension_1(); ++x) {
          a(i,x) = i;
        }
      }
  );

  Kokkos::View<const unsigned*[COMPILE_TIME_DIMENSION], KOKKOS_THREAD_DEVICE> b = a;

  // Cannot portably call a GTEST macro in parallel,
  // so count the errors and test that the count is zero.
  int num_errors = 0;
  Kokkos::parallel_reduce<KOKKOS_THREAD_DEVICE, int /* reduction value type */>(
      b.dimension_0()
    , [](int & local_errors)                 // init lambda
      { local_errors = 0; }
    , [=](size_t i, int & local_errors) {    // operator() lambda
        for (size_t x = 0; x < b.dimension_1(); ++x)
          local_errors += (i == b(i,x)) ? 0 : 1;
      }
    , [](volatile int & dst_err, volatile int const & src_err)  // join lambda
      { dst_err += src_err; }
    , num_errors                             // where to store the result
  );

  EXPECT_EQ(0, num_errors);
}
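A minimal sketch of the same check in current Kokkos style, assuming a recent release (where views expose extent() and a plain sum reduction needs no explicit init/join lambdas). The test name and the RangePolicy over KOKKOS_THREAD_DEVICE are illustrative, not from the original source.

TEST_F( KokkosThreads, LambdaInitializeSingleLambda)
{
  Kokkos::View<unsigned*[COMPILE_TIME_DIMENSION], KOKKOS_THREAD_DEVICE> a(
      Kokkos::ViewAllocateWithoutInitializing("node views"), RUN_TIME_DIMENSION);

  Kokkos::parallel_for(
      Kokkos::RangePolicy<KOKKOS_THREAD_DEVICE>(0, a.extent(0)),
      KOKKOS_LAMBDA(const size_t i) {
        for (size_t x = 0; x < a.extent(1); ++x) a(i,x) = i;
      });

  Kokkos::View<const unsigned*[COMPILE_TIME_DIMENSION], KOKKOS_THREAD_DEVICE> b = a;

  // The library supplies the init and join steps of the sum reduction.
  int num_errors = 0;
  Kokkos::parallel_reduce(
      Kokkos::RangePolicy<KOKKOS_THREAD_DEVICE>(0, b.extent(0)),
      KOKKOS_LAMBDA(const size_t i, int & local_errors) {
        for (size_t x = 0; x < b.extent(1); ++x)
          local_errors += (i == b(i,x)) ? 0 : 1;
      },
      num_errors);

  EXPECT_EQ(0, num_errors);
}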
Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > >
Map<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
replaceCommWithSubset (const Teuchos::RCP<const Teuchos::Comm<int> >& newComm) const
{
  using Teuchos::ArrayView;
  using Teuchos::outArg;
  using Teuchos::RCP;
  using Teuchos::REDUCE_MIN;
  using Teuchos::reduceAll;
  typedef global_size_t GST;
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Map<LO, GO, node_type> map_type;

  // mfh 26 Mar 2013: The lazy way to do this is simply to recreate
  // the Map by calling its ordinary public constructor, using the
  // original Map's data.  This only involves O(1) all-reduces over
  // the new communicator, which in the common case only includes a
  // small number of processes.

  // Create the Map to return.
  if (newComm.is_null ()) {
    return Teuchos::null; // my process does not participate in the new Map
  } else {
    // Map requires that the index base equal the global min GID.
    // Figuring out the global min GID requires a reduction over all
    // processes in the new communicator.  It could be that some (or
    // even all) of these processes contain zero entries.  (Recall
    // that this method, unlike removeEmptyProcesses(), may remove
    // an arbitrary subset of processes.)  We deal with this by
    // doing a min over the min GID on each process if the process
    // has more than zero entries, or the global max GID, if that
    // process has zero entries.  If no processes have any entries,
    // then the index base doesn't matter anyway.
    const GO myMinGid = (this->getNodeNumElements () == 0) ?
      this->getMaxAllGlobalIndex () : this->getMinGlobalIndex ();
    GO newIndexBase = this->getInvalidGlobalIndex ();
    reduceAll<int, GO> (*newComm, REDUCE_MIN, myMinGid, outArg (newIndexBase));

    // Make Map's constructor compute the global number of indices.
    const GST globalNumInds = Teuchos::OrdinalTraits<GST>::invalid ();

    if (mapDevice_.initialized ()) {
      Kokkos::View<const GO*, DeviceType> myGIDs =
        mapDevice_.getMyGlobalIndices ();
      return rcp (new map_type (globalNumInds, myGIDs, newIndexBase,
                                newComm, this->getNode ()));
    } else {
      Kokkos::View<const GO*, host_mirror_device_type> myGidsHostView =
        mapHost_.getMyGlobalIndices ();
      ArrayView<const GO> myGidsArrayView (myGidsHostView.ptr_on_device (),
                                           myGidsHostView.dimension_0 ());
      return rcp (new map_type (globalNumInds, myGidsArrayView, newIndexBase,
                                newComm, this->getNode ()));
    }
  }
}
static void GEMM(Teuchos::ETransp transA, Teuchos::ETransp transB, Scalar alpha,
                 Kokkos::View<Scalar***,Kokkos::LayoutLeft,Kokkos::DefaultExecutionSpace> A,
                 Kokkos::View<Scalar***,Kokkos::LayoutLeft,Kokkos::DefaultExecutionSpace> B,
                 Scalar beta,
                 Kokkos::View<Scalar***,Kokkos::LayoutLeft,Kokkos::DefaultExecutionSpace> C)
{
  const int m = static_cast<int> (C.dimension_1 ()),
            n = static_cast<int> (C.dimension_2 ()),
            k = (transA == Teuchos::NO_TRANS ? A.dimension_2 () : A.dimension_1 ());
  // printf("m:%d,n:%d,k:%d",m,n,k);
  Kokkos::parallel_for(C.dimension_0(),
                       blasOpenMPBatchLeft<Scalar>(A,B,C,m,n,k,transA,transB,alpha,beta));
}
KOKKOS_INLINE_FUNCTION
void operator() (int i) const {
  double tmp = 0.0;
  for (int j = 0; j < idx.dimension_1(); j++) {
    const double val = src(idx(i,j));
    tmp += val*val + 0.5*(idx.dimension_0()*val - idx.dimension_1()*val);
  }
  dest(i) += tmp;
}
Teuchos::ArrayView<const GlobalOrdinal>
Map<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
getNodeElementList () const
{
  typedef GlobalOrdinal GO;
  Kokkos::View<const GO*, host_mirror_device_type> myGlobalInds =
    mapHost_.getMyGlobalIndices (); // creates it if it doesn't exist
  return Teuchos::ArrayView<const GO> (myGlobalInds.ptr_on_device (),
                                       myGlobalInds.dimension_0 ());
}
static void GEMM(Teuchos::ETransp transA, Teuchos::ETransp transB, Scalar alpha,
                 Kokkos::View<Scalar***,Kokkos::LayoutRight,Kokkos::DefaultExecutionSpace> A,
                 Kokkos::View<Scalar***,Kokkos::LayoutRight,Kokkos::DefaultExecutionSpace> B,
                 Scalar beta,
                 Kokkos::View<Scalar***,Kokkos::LayoutRight,Kokkos::DefaultExecutionSpace> C)
{
  const int m = static_cast<int> (C.dimension_1 ()),
            n = static_cast<int> (C.dimension_2 ()),
            k = (transA == Teuchos::NO_TRANS ? A.dimension_2 () : A.dimension_1 ());
  Teuchos::BLAS<int,Scalar> blas;
  Kokkos::parallel_for(C.dimension_0(), KOKKOS_LAMBDA (const size_t i) {
    blas.GEMM(transB, transA, n, m, k, alpha,
              &B(i,0,0), n, &A(i,0,0), k, beta, &C(i,0,0), n);
  });
}
void NearestNeighborOperator<DeviceType>::apply(
    Kokkos::View<double const *, DeviceType> source_values,
    Kokkos::View<double *, DeviceType> target_values ) const
{
    // Precondition: check that the source and target are properly sized
    DTK_REQUIRE( _indices.extent( 0 ) == target_values.extent( 0 ) );
    DTK_REQUIRE( _size == source_values.extent_int( 0 ) );

    auto values = Details::NearestNeighborOperatorImpl<DeviceType>::fetch(
        _comm, _ranks, _indices, source_values );

    Kokkos::deep_copy( target_values, values );
}
FieldContainer_Kokkos(Kokkos::View<ScalarPointer,Kokkos::LayoutLeft,Kokkos::OpenMP>& InContainer)
{
  dim0=dim[0]=InContainer.dimension(0);
  dim1=dim[1]=InContainer.dimension(1);
  dim2=dim[2]=InContainer.dimension(2);
  dim3=dim[3]=InContainer.dimension(3);
  dim4=dim[4]=InContainer.dimension(4);
  dim5=dim[5]=InContainer.dimension(5);
  dim6=dim[6]=InContainer.dimension(6);
  dim7=dim[7]=InContainer.dimension(7);
  rankValue=Kokkos::View<ScalarPointer,Kokkos::LayoutLeft,Kokkos::OpenMP>::Rank;
  intrepidManaged=false;
  switch(rankValue){
    case 1: sizeValue=dim0; break;
    case 2: sizeValue=dim0*dim1; break;
    case 3: sizeValue=dim0*dim1*dim2; break;
    case 4: sizeValue=dim0*dim1*dim2*dim3; break;
    case 5: sizeValue=dim0*dim1*dim2*dim3*dim4; break;
    case 6: sizeValue=dim0*dim1*dim2*dim3*dim4*dim5; break;
    case 7: sizeValue=dim0*dim1*dim2*dim3*dim4*dim5*dim6; break;
    case 8: sizeValue=dim0*dim1*dim2*dim3*dim4*dim5*dim6*dim7; break;
  }
  containerMemory=InContainer.ptr_on_device();
}
KOKKOS_INLINE_FUNCTION
NestedView & operator = ( const Kokkos::View<int*,Space> & lhs )
{
  member = lhs ;
  if ( member.dimension_0() ) Kokkos::atomic_add( & member(0) , 1 );
  return *this ;
}
KOKKOS_INLINE_FUNCTION
~NestedView()
{
  if ( member.dimension_0() ) {
    Kokkos::atomic_add( & member(0) , -1 );
  }
}
KOKKOS_INLINE_FUNCTION
void operator() ( const team_member & thread) const {
  int i = thread.league_rank();

  // Allocate a shared array for the team.
  shared_1d_int count(thread.team_shmem(),data.dimension_1());

  // With each team run a parallel_for with its threads.
  Kokkos::parallel_for(Kokkos::TeamThreadRange(thread,data.dimension_1()), [=] (const int& j) {
    int tsum;
    // Run a vector loop reduction over the inner dimension of data.
    // Count how many values are multiples of 4.
    // Every vector lane gets the same reduction value (tsum) back; it is
    // broadcast to all vector lanes.
    Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(thread,data.dimension_2()), [=] (const int& k, int & vsum) {
      vsum += (data(i,j,k) % 4 == 0) ? 1 : 0;
    },tsum);

    // Make sure only one vector lane adds the reduction value to the shared
    // array, i.e. execute the next line only once per thread.
    Kokkos::single(Kokkos::PerThread(thread),[=] () {
      count(j) = tsum;
    });
  });

  // Wait for all threads to finish the parallel_for so that all shared
  // memory writes are done.
  thread.team_barrier();

  // Check with one vector lane from each thread how many consecutive
  // data segments have the same number of values divisible by 4.
  // The team reduction value is again broadcast to every team member
  // (and every vector lane).
  int team_sum = 0;
  Kokkos::parallel_reduce(Kokkos::TeamThreadRange(thread, data.dimension_1()-1), [=] (const int& j, int& thread_sum) {
    // It is not valid to directly add to thread_sum.
    // Use a single function with broadcast instead:
    // team_sum will be used as input to the operator (i.e. it is used to
    // initialize sum), and the end value of sum will be broadcast to all
    // vector lanes in the thread.
    Kokkos::single(Kokkos::PerThread(thread),[=] (int& sum) {
      if(count(j)==count(j+1)) sum++;
    },thread_sum);
  },team_sum);

  // Add the team_sum to the global value with one thread and vector lane
  // of the team.
  Kokkos::single(Kokkos::PerTeam(thread),[=] () {
    Kokkos::atomic_add(&gsum(),team_sum);
  });
}
void kk_inspector_matvec(AType A, XType x, YType y,
                         int rows_per_thread, int team_size, int vector_length) {

  typedef typename XType::non_const_value_type Scalar;
  typedef typename AType::execution_space execution_space;
  typedef KokkosSparse::CrsMatrix<const Scalar,int,execution_space,void,int> matrix_type ;
  typedef typename Kokkos::View<Scalar*,Kokkos::LayoutLeft,execution_space> y_type;
  typedef typename Kokkos::View<const Scalar*,Kokkos::LayoutLeft,execution_space,Kokkos::MemoryRandomAccess > x_type;

  //int rows_per_team = launch_parameters<execution_space>(A.numRows(),A.nnz(),rows_per_thread,team_size,vector_length);
  //static int worksets = (y.extent(0)+rows_per_team-1)/rows_per_team;
  static int worksets = std::is_same<Schedule,Kokkos::Static>::value ?
    (team_size>0 ? execution_space::concurrency()/team_size : execution_space::concurrency()) :      // static
    (team_size>0 ? execution_space::concurrency()*32/team_size : execution_space::concurrency()*32); // dynamic

  static Kokkos::View<int*> workset_offsets;
  if(workset_offsets.extent(0) == 0) {
    workset_offsets = Kokkos::View<int*>("WorksetOffsets",worksets+1);
    const size_t nnz = A.nnz();
    int nnz_per_workset = (nnz+worksets-1)/worksets;
    workset_offsets(0) = 0;
    int ws = 1;
    for(int row = 0; row<A.numRows(); row++) {
      if(A.graph.row_map(row) > ws*nnz_per_workset) {
        workset_offsets(ws) = row;
        ws++;
      }
    }
    if(workset_offsets(ws-1) < A.numRows()) {
      workset_offsets(ws) = A.numRows();
    }
    printf("Worksets: %i %i\n",worksets,ws);
    worksets = ws;
  }

  double s_a = 1.0;
  double s_b = 0.0;
  SPMV_Inspector_Functor<matrix_type,x_type,y_type,0,false,int> func(s_a,A,x,workset_offsets,s_b,y);

  Kokkos::TeamPolicy<Kokkos::Schedule<Schedule> > policy(1,1);
  if(team_size>0)
    policy = Kokkos::TeamPolicy<Kokkos::Schedule<Schedule> >(worksets,team_size,vector_length);
  else
    policy = Kokkos::TeamPolicy<Kokkos::Schedule<Schedule> >(worksets,Kokkos::AUTO,vector_length);

  Kokkos::parallel_for("KokkosSparse::PerfTest::SpMV_Inspector", policy, func);
}
/** Access the local IDs for an element. The local ordering is according to
  * the <code>getOwnedAndSharedIndices</code> method.
  */
void getElementLIDs(Kokkos::View<const int*,PHX::Device> cellIds,
                    Kokkos::View<LocalOrdinalT**,PHX::Device> lids) const
{
  CopyCellLIDsFunctor functor;
  functor.cellIds = cellIds;
  functor.global_lids = localIDs_k_;
  functor.local_lids = lids; // we assume this array is sized correctly!

  Kokkos::parallel_for(cellIds.dimension_0(),functor);
}
void multiply(const CrsMatrix< float , Kokkos::OpenMP >& A,
              const Kokkos::View< float* , Kokkos::OpenMP >& x,
              Kokkos::View< float* , Kokkos::OpenMP >& y,
              MKLMultiply tag)
{
  MKL_INT n = A.graph.row_map.dimension_0() - 1 ;
  float *A_values = A.values.ptr_on_device() ;
  MKL_INT *col_indices = A.graph.entries.ptr_on_device() ;
  MKL_INT *row_beg = const_cast<MKL_INT*>(A.graph.row_map.ptr_on_device()) ;
  MKL_INT *row_end = row_beg+1;
  char matdescra[6] = { 'G', 'x', 'N', 'C', 'x', 'x' };
  char trans = 'N';
  float alpha = 1.0;
  float beta = 0.0;

  float *x_values = x.ptr_on_device() ;
  float *y_values = y.ptr_on_device() ;

  mkl_scsrmv(&trans, &n, &n, &alpha, matdescra,
             A_values, col_indices, row_beg, row_end,
             x_values, &beta, y_values);
}
NearestNeighborOperator<DeviceType>::NearestNeighborOperator(
    MPI_Comm comm, Kokkos::View<Coordinate const **, DeviceType> source_points,
    Kokkos::View<Coordinate const **, DeviceType> target_points )
    : _comm( comm )
    , _indices( "indices" )
    , _ranks( "ranks" )
    , _size( source_points.extent_int( 0 ) )
{
    // NOTE: instead of checking the precondition that there is at least one
    // source point passed to one of the ranks, we let the tree handle the
    // communication and just check that the tree is not empty.

    // Build distributed search tree over the source points.
    DistributedSearchTree<DeviceType> search_tree( _comm, source_points );

    // Tree must have at least one leaf, otherwise it makes little sense to
    // perform the search for nearest neighbors.
    DTK_CHECK( !search_tree.empty() );

    // Query nearest neighbor for all target points.
    auto nearest_queries = Details::NearestNeighborOperatorImpl<
        DeviceType>::makeNearestNeighborQueries( target_points );

    // Perform the actual search.
    Kokkos::View<int *, DeviceType> indices( "indices" );
    Kokkos::View<int *, DeviceType> offset( "offset" );
    Kokkos::View<int *, DeviceType> ranks( "ranks" );
    search_tree.query( nearest_queries, indices, offset, ranks );

    // Check post-condition that we did find a nearest neighbor to all target
    // points.
    DTK_ENSURE( lastElement( offset ) == target_points.extent_int( 0 ) );

    // Save results.
    // NOTE: we don't bother keeping `offset` around since it is just
    // [0, 1, 2, ..., n_target_points]
    _indices = indices;
    _ranks = ranks;
}
static Kokkos::View<size_t *, DeviceType>
sortQueriesAlongZOrderCurve( Box const &scene_bounding_box,
                             Kokkos::View<Query *, DeviceType> queries )
{
    auto const n_queries = queries.extent( 0 );

    Kokkos::View<unsigned int *, DeviceType> morton_codes(
        Kokkos::ViewAllocateWithoutInitializing( "morton" ), n_queries );
    Kokkos::parallel_for(
        ARBORX_MARK_REGION( "assign_morton_codes_to_queries" ),
        Kokkos::RangePolicy<ExecutionSpace>( 0, n_queries ),
        KOKKOS_LAMBDA( int i ) {
            Point xyz = Details::returnCentroid( queries( i )._geometry );
            translateAndScale( xyz, xyz, scene_bounding_box );
            morton_codes( i ) = morton3D( xyz[0], xyz[1], xyz[2] );
        } );
void pointInCell( double threshold,
                  Kokkos::View<Coordinate **, DeviceType> physical_points,
                  Kokkos::View<Coordinate ***, DeviceType> cells,
                  Kokkos::View<int *, DeviceType> coarse_search_output_cells,
                  Kokkos::View<Coordinate **, DeviceType> reference_points,
                  Kokkos::View<bool *, DeviceType> point_in_cell )
{
    using ExecutionSpace = typename DeviceType::execution_space;

    int const n_ref_pts = reference_points.extent( 0 );

    Functor::PointInCell<CellType, DeviceType> search_functor(
        threshold, physical_points, cells, coarse_search_output_cells,
        reference_points, point_in_cell );
    Kokkos::parallel_for( DTK_MARK_REGION( "point_in_cell" ),
                          Kokkos::RangePolicy<ExecutionSpace>( 0, n_ref_pts ),
                          search_functor );
}
static void GEMM(Teuchos::ETransp transA, Teuchos::ETransp transB, Scalar alpha,
                 Kokkos::View<Scalar**,Kokkos::LayoutRight,Kokkos::DefaultExecutionSpace> A,
                 Kokkos::View<Scalar**,Kokkos::LayoutRight,Kokkos::DefaultExecutionSpace> B,
                 Scalar beta,
                 Kokkos::View<Scalar**,Kokkos::LayoutRight,Kokkos::DefaultExecutionSpace> C)
{
  Teuchos::BLAS<int,Scalar> blas;
  const int m = static_cast<int> (C.dimension_0 ()),
            n = static_cast<int> (C.dimension_1 ()),
            k = (transA == Teuchos::NO_TRANS ? A.dimension_1 () : A.dimension_0 ());
  blas.GEMM(transB, transA, n, m, k, alpha,
            B.ptr_on_device(), n, A.ptr_on_device(), k,
            beta, C.ptr_on_device(), n);
}
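A hypothetical usage sketch for the rank-2 GEMM wrapper above: it computes C = alpha*op(A)*op(B) + beta*C on row-major views by swapping the operands in the column-major BLAS call. The free-function form, the sizes, and Scalar = double are assumptions for illustration, not from the original source.

void gemm_usage_example()
{
  const int m = 4, n = 3, k = 5;
  Kokkos::View<double**,Kokkos::LayoutRight,Kokkos::DefaultExecutionSpace> A("A", m, k);
  Kokkos::View<double**,Kokkos::LayoutRight,Kokkos::DefaultExecutionSpace> B("B", k, n);
  Kokkos::View<double**,Kokkos::LayoutRight,Kokkos::DefaultExecutionSpace> C("C", m, n);
  Kokkos::deep_copy(A, 1.0);
  Kokkos::deep_copy(B, 2.0);
  // C = 1.0 * A*B + 0.0 * C; every C(i,j) should end up equal to k * 2.0 = 10.0.
  GEMM(Teuchos::NO_TRANS, Teuchos::NO_TRANS, 1.0, A, B, 0.0, C);
}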
KOKKOS_INLINE_FUNCTION
void operator()( size_t i ) const
{
  if ( i < m_elem_node.dimension_0() * m_elem_node.dimension_1() ) {
    const size_t ielem = i / ElemNode ;
    const size_t inode = i % ElemNode ;

    unsigned elem_grid[SpaceDim] ;
    unsigned node_grid[SpaceDim] ;

    m_box_part.uses_elem_coord( ielem , elem_grid );

    enum { elem_node_scale = Order == BoxElemPart::ElemLinear ? 1 :
                             Order == BoxElemPart::ElemQuadratic ? 2 : 0 };

    node_grid[0] = elem_node_scale * elem_grid[0] + m_elem_node_local[inode][0] ;
    node_grid[1] = elem_node_scale * elem_grid[1] + m_elem_node_local[inode][1] ;
    node_grid[2] = elem_node_scale * elem_grid[2] + m_elem_node_local[inode][2] ;

    m_elem_node(ielem,inode) = m_box_part.local_node_id( node_grid );
  }

  if ( i < m_node_grid.dimension_0() ) {
    unsigned node_grid[SpaceDim] ;
    m_box_part.local_node_coord( i , node_grid );
    m_node_grid(i,0) = node_grid[0] ;
    m_node_grid(i,1) = node_grid[1] ;
    m_node_grid(i,2) = node_grid[2] ;

    m_coord_map( node_grid[0] , node_grid[1] , node_grid[2] ,
                 m_node_coord(i,0) , m_node_coord(i,1) , m_node_coord(i,2) );
  }

  if ( i < m_recv_node.dimension_0() ) {
    m_recv_node(i,0) = m_box_part.recv_node_rank(i);
    m_recv_node(i,1) = m_box_part.recv_node_count(i);
  }

  if ( i < m_send_node.dimension_0() ) {
    m_send_node(i,0) = m_box_part.send_node_rank(i);
    m_send_node(i,1) = m_box_part.send_node_count(i);
  }

  if ( i < m_send_node_id.dimension_0() ) {
    m_send_node_id(i) = m_box_part.send_node_id(i);
  }
}
KOKKOS_INLINE_FUNCTION
void operator()( const typename policy_type::member_type ind , value_type & error ) const
{
  if ( 0 == ind.league_rank() && 0 == ind.team_rank() ) {
    const long int thread_count = ind.league_size() * ind.team_size();
    total() = ( thread_count * ( thread_count + 1 ) ) / 2 ;
  }

  // Team max:
  const long int m = ind.team_reduce( (long int) ( ind.league_rank() + ind.team_rank() ) , JoinMax() );

  if ( m != ind.league_rank() + ( ind.team_size() - 1 ) ) {
#ifndef __KALMAR_ACCELERATOR__
    printf("ScanTeamFunctor[%d.%d of %d.%d] reduce_max_answer(%ld) != reduce_max(%ld)\n"
          , ind.league_rank(), ind.team_rank()
          , ind.league_size(), ind.team_size()
          , (long int)(ind.league_rank() + ( ind.team_size() - 1 )) , m );
#endif
  }

  // Scan:
  const long int answer = ( ind.league_rank() + 1 ) * ind.team_rank() +
                          ( ind.team_rank() * ( ind.team_rank() + 1 ) ) / 2 ;

  const long int result  = ind.team_scan( ind.league_rank() + 1 + ind.team_rank() + 1 );
  const long int result2 = ind.team_scan( ind.league_rank() + 1 + ind.team_rank() + 1 );

  if ( answer != result || answer != result2 ) {
#ifndef __KALMAR_ACCELERATOR__
    printf("ScanTeamFunctor[%d.%d of %d.%d] answer(%ld) != scan_first(%ld) or scan_second(%ld)\n",
           ind.league_rank(), ind.team_rank(), ind.league_size(), ind.team_size(),
           answer, result, result2);
#endif
    error = 1 ;
  }

  const long int thread_rank = ind.team_rank() + ind.team_size() * ind.league_rank();
  ind.team_scan( 1 + thread_rank , accum.ptr_on_device() );
}
BoxElemFixture( const BoxElemPart::Decompose decompose ,
                const unsigned global_size ,
                const unsigned global_rank ,
                const unsigned elem_nx ,
                const unsigned elem_ny ,
                const unsigned elem_nz ,
                const float bubble_x = 1.1f ,
                const float bubble_y = 1.2f ,
                const float bubble_z = 1.3f )
  : m_box_part( Order , decompose , global_size , global_rank , elem_nx , elem_ny , elem_nz )
  , m_coord_map( m_box_part.global_coord_max(0) ,
                 m_box_part.global_coord_max(1) ,
                 m_box_part.global_coord_max(2) ,
                 bubble_x , bubble_y , bubble_z )
  , m_node_coord( "fixture_node_coord" , m_box_part.uses_node_count() )
  , m_node_grid( "fixture_node_grid" , m_box_part.uses_node_count() )
  , m_elem_node( "fixture_elem_node" , m_box_part.uses_elem_count() )
  , m_recv_node( "fixture_recv_node" , m_box_part.recv_node_msg_count() )
  , m_send_node( "fixture_send_node" , m_box_part.send_node_msg_count() )
  , m_send_node_id( "fixture_send_node_id" , m_box_part.send_node_id_count() )
{
  {
    const hex_data elem_data ;

    for ( unsigned i = 0 ; i < ElemNode ; ++i ) {
      m_elem_node_local[i][0] = elem_data.eval_map[i][0] ;
      m_elem_node_local[i][1] = elem_data.eval_map[i][1] ;
      m_elem_node_local[i][2] = elem_data.eval_map[i][2] ;
      m_elem_node_local[i][3] = 0 ;
    }
  }

  const size_t nwork =
    std::max( m_recv_node.dimension_0() ,
    std::max( m_send_node.dimension_0() ,
    std::max( m_send_node_id.dimension_0() ,
    std::max( m_node_grid.dimension_0() ,
              m_elem_node.dimension_0() * m_elem_node.dimension_1() ))));

  Kokkos::parallel_for( nwork , *this );
}
void modified_gram_schmidt(
  const Kokkos::View< ScalarQ ** , Kokkos::LayoutLeft , DeviceType , Management > & Q ,
  const Kokkos::View< ScalarR ** , Kokkos::LayoutLeft , DeviceType , Management > & R ,
  comm::Machine machine )
{
  const Kokkos::ALL ALL ;

  typedef Kokkos::View< ScalarQ * , Kokkos::LayoutLeft , DeviceType , Kokkos::MemoryUnmanaged >
    vector_view_type ;

  const typename Kokkos::View< ScalarR ** , Kokkos::LayoutLeft , DeviceType >::HostMirror
    hostR = Kokkos::create_mirror_view( R );

  const int length = Q.dimension_0();
  const int count  = Q.dimension_1();

  for ( int j = 0 ; j < count ; ++j ) {
    const vector_view_type Qj = Kokkos::subview< vector_view_type >( Q , ALL , j );

    // reads  += length
    // writes += 0
    // flops  += 1 + 2 * length
    const double norm_Qj = Kokkos::norm2( length , Qj , machine );

    hostR(j,j) = norm_Qj ;

    // reads  += length
    // writes += length
    // flops  += 1 + length
    Kokkos::scale( length , 1.0 / norm_Qj , Qj );

    for ( int k = j + 1 ; k < count ; ++k ) {
      const vector_view_type Qk = Kokkos::subview< vector_view_type >( Q , ALL , k );

      // reads  += 2 * length
      // writes += 0
      // flops  += 2 * length
      const double Qj_dot_Qk = Kokkos::dot( length , Qj , Qk , machine );

      hostR(j,k) = Qj_dot_Qk ;

      // reads  += 2 * length
      // writes += length
      // flops  += 2 * length
      Kokkos::axpy( length , - Qj_dot_Qk , Qj , Qk );
    }
  }

  // reads  += 0
  // writes += count * count
  Kokkos::deep_copy( R , hostR );
}
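For reference, the update rules this routine implements (read directly off the code; standard modified Gram-Schmidt) are, for each column j and each trailing column k > j:

\begin{align*}
  R_{jj} &= \lVert Q_j \rVert_2 , &
  Q_j &\leftarrow Q_j / R_{jj} , \\
  R_{jk} &= Q_j^{\mathsf{T}} Q_k , &
  Q_k &\leftarrow Q_k - R_{jk}\, Q_j .
\end{align*}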
void PointInCell<DeviceType>::search(
    Kokkos::View<Coordinate **, DeviceType> physical_points,
    Kokkos::View<Coordinate ***, DeviceType> cells,
    Kokkos::View<int *, DeviceType> coarse_search_output_cells,
    DTK_CellTopology cell_topo,
    Kokkos::View<Coordinate **, DeviceType> reference_points,
    Kokkos::View<bool *, DeviceType> point_in_cell )
{
    // Check the size of the Views
    DTK_REQUIRE( reference_points.extent( 0 ) == point_in_cell.extent( 0 ) );
    DTK_REQUIRE( reference_points.extent( 0 ) == physical_points.extent( 0 ) );
    DTK_REQUIRE( reference_points.extent( 1 ) == physical_points.extent( 1 ) );
    DTK_REQUIRE( reference_points.extent( 1 ) == cells.extent( 2 ) );

    // Perform the point-in-cell search. We hide the template parameters used
    // by Intrepid2 behind the CellType template.
    // Note that if the Newton solver does not converge, Intrepid2 will just
    // return the last iterate, and there is no way to know that the
    // coordinates in the reference frame were not found.
    switch ( cell_topo )
    {
    case DTK_HEX_8:
    {
        internal::pointInCell<HEX_8, DeviceType>(
            threshold, physical_points, cells, coarse_search_output_cells,
            reference_points, point_in_cell );
        break;
    }
    case DTK_HEX_27:
    {
        internal::pointInCell<HEX_27, DeviceType>(
            threshold, physical_points, cells, coarse_search_output_cells,
            reference_points, point_in_cell );
        break;
    }
    case DTK_PYRAMID_5:
    {
        internal::pointInCell<PYRAMID_5, DeviceType>(
            threshold, physical_points, cells, coarse_search_output_cells,
            reference_points, point_in_cell );
        break;
    }
    case DTK_QUAD_4:
    {
        internal::pointInCell<QUAD_4, DeviceType>(
            threshold, physical_points, cells, coarse_search_output_cells,
            reference_points, point_in_cell );
        break;
    }
    case DTK_QUAD_9:
    {
        internal::pointInCell<QUAD_9, DeviceType>(
            threshold, physical_points, cells, coarse_search_output_cells,
            reference_points, point_in_cell );
        break;
    }
    case DTK_TET_4:
    {
        internal::pointInCell<TET_4, DeviceType>(
            threshold, physical_points, cells, coarse_search_output_cells,
            reference_points, point_in_cell );
        break;
    }
    case DTK_TET_10:
    {
        internal::pointInCell<TET_10, DeviceType>(
            threshold, physical_points, cells, coarse_search_output_cells,
            reference_points, point_in_cell );
        break;
    }
    case DTK_TRI_3:
    {
        internal::pointInCell<TRI_3, DeviceType>(
            threshold, physical_points, cells, coarse_search_output_cells,
            reference_points, point_in_cell );
        break;
    }
    case DTK_TRI_6:
    {
        internal::pointInCell<TRI_6, DeviceType>(
            threshold, physical_points, cells, coarse_search_output_cells,
            reference_points, point_in_cell );
        break;
    }
    case DTK_WEDGE_6:
    {
        internal::pointInCell<WEDGE_6, DeviceType>(
            threshold, physical_points, cells, coarse_search_output_cells,
            reference_points, point_in_cell );
        break;
    }
    case DTK_WEDGE_18:
    {
        internal::pointInCell<WEDGE_18, DeviceType>(
            threshold, physical_points, cells, coarse_search_output_cells,
            reference_points, point_in_cell );
        break;
    }
    default:
    {
        throw DataTransferKitNotImplementedException();
    }
    }
    Kokkos::fence();
}
KOKKOS_INLINE_FUNCTION unsigned node_count() const { return m_node_grid.dimension_0(); }
KOKKOS_INLINE_FUNCTION unsigned elem_count() const { return m_elem_node.dimension_0(); }
KOKKOS_INLINE_FUNCTION
void operator()(const int cell) const
{
  for(int i=0;i<Teuchos::as<int>(local_lids.dimension_1());i++)
    local_lids(cell,i) = global_lids(cellIds(cell),i);
}
// The functor needs to define how much shared memory it requests given a team_size.
size_t team_shmem_size( int team_size ) const {
  return shared_1d_int::shmem_size(data.dimension_1());
}
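A minimal launch sketch for a team functor like the one above, assuming the enclosing type is named TeamFunctor and owns the data and gsum views (both names are stand-ins for whatever the real functor uses). When the functor defines team_shmem_size(), Kokkos sizes each team's scratch allocation from it automatically at dispatch.

TeamFunctor functor( data, gsum );
// One team per slice of the first dimension: operator() uses league_rank()
// to pick its slice, and Kokkos::AUTO lets Kokkos choose the team size.
Kokkos::parallel_for( Kokkos::TeamPolicy<>( data.extent(0), Kokkos::AUTO ), functor );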
SortView( const Kokkos::View<ValueType*,ExecSpace> v , int begin , int end )
{
  std::sort( v.ptr_on_device() + begin , v.ptr_on_device() + end );
}
SortView( const Kokkos::View<ValueType*,Kokkos::Cuda> v , int begin , int end )
{
  thrust::sort( thrust::device_ptr<ValueType>( v.ptr_on_device() + begin )
              , thrust::device_ptr<ValueType>( v.ptr_on_device() + end ) );
}
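A hedged alternative covering both SortView specializations with portable Kokkos facilities, assuming a recent release that provides <Kokkos_Sort.hpp>: sort just the half-open [begin, end) slice of a one-dimensional view on the view's own device, with no direct calls into std::sort or Thrust. The function name is hypothetical.

#include <Kokkos_Core.hpp>
#include <Kokkos_Sort.hpp>

template <class ValueType, class ExecSpace>
void sort_view_range( const Kokkos::View<ValueType*, ExecSpace>& v, int begin, int end )
{
  // Take an unmanaged slice of the requested range and sort it in place with
  // Kokkos' built-in sort, which executes on the view's execution space.
  auto slice = Kokkos::subview( v, Kokkos::make_pair( begin, end ) );
  Kokkos::sort( slice );
}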