static void GEMM(Teuchos::ETransp transA, Teuchos::ETransp transB, Scalar alpha, Kokkos::View<Scalar**,Kokkos::LayoutRight,Kokkos::DefaultExecutionSpace> A, Kokkos::View<Scalar**,Kokkos::LayoutRight,Kokkos::DefaultExecutionSpace> B, Scalar beta, Kokkos::View<Scalar**,Kokkos::LayoutRight,Kokkos::DefaultExecutionSpace> C){ Teuchos::BLAS<int,Scalar>blas; const int m = static_cast<int> (C.dimension_0 ()), n = static_cast<int> (C.dimension_1 ()), k = (transA == Teuchos::NO_TRANS ? A.dimension_1 () : A.dimension_0 ()); blas.GEMM(transB, transA, n, m, k, alpha, B.ptr_on_device(), n, A.ptr_on_device(), k, beta, C.ptr_on_device(), n); }
Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > > Map<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >:: replaceCommWithSubset (const Teuchos::RCP<const Teuchos::Comm<int> >& newComm) const { using Teuchos::ArrayView; using Teuchos::outArg; using Teuchos::RCP; using Teuchos::REDUCE_MIN; using Teuchos::reduceAll; typedef global_size_t GST; typedef LocalOrdinal LO; typedef GlobalOrdinal GO; typedef Map<LO, GO, node_type> map_type; // mfh 26 Mar 2013: The lazy way to do this is simply to recreate // the Map by calling its ordinary public constructor, using the // original Map's data. This only involves O(1) all-reduces over // the new communicator, which in the common case only includes a // small number of processes. // Create the Map to return. if (newComm.is_null ()) { return Teuchos::null; // my process does not participate in the new Map } else { // Map requires that the index base equal the global min GID. // Figuring out the global min GID requires a reduction over all // processes in the new communicator. It could be that some (or // even all) of these processes contain zero entries. (Recall // that this method, unlike removeEmptyProcesses(), may remove // an arbitrary subset of processes.) We deal with this by // doing a min over the min GID on each process if the process // has more than zero entries, or the global max GID, if that // process has zero entries. If no processes have any entries, // then the index base doesn't matter anyway. const GO myMinGid = (this->getNodeNumElements () == 0) ? this->getMaxAllGlobalIndex () : this->getMinGlobalIndex (); GO newIndexBase = this->getInvalidGlobalIndex (); reduceAll<int, GO> (*newComm, REDUCE_MIN, myMinGid, outArg (newIndexBase)); // Make Map's constructor compute the global number of indices. const GST globalNumInds = Teuchos::OrdinalTraits<GST>::invalid (); if (mapDevice_.initialized ()) { Kokkos::View<const GO*, DeviceType> myGIDs = mapDevice_.getMyGlobalIndices (); return rcp (new map_type (globalNumInds, myGIDs, newIndexBase, newComm, this->getNode ())); } else { Kokkos::View<const GO*, host_mirror_device_type> myGidsHostView = mapHost_.getMyGlobalIndices (); ArrayView<const GO> myGidsArrayView (myGidsHostView.ptr_on_device (), myGidsHostView.dimension_0 ()); return rcp (new map_type (globalNumInds, myGidsArrayView, newIndexBase, newComm, this->getNode ())); } } }
Teuchos::ArrayView<const GlobalOrdinal> Map<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >:: getNodeElementList () const { typedef GlobalOrdinal GO; Kokkos::View<const GO*, host_mirror_device_type> myGlobalInds = mapHost_.getMyGlobalIndices (); // creates it if it doesn't exist return Teuchos::ArrayView<const GO> (myGlobalInds.ptr_on_device (), myGlobalInds.dimension_0 ()); }
void multiply(const CrsMatrix< float , Kokkos::OpenMP >& A, const Kokkos::View< float* , Kokkos::OpenMP >& x, Kokkos::View< float* , Kokkos::OpenMP >& y, MKLMultiply tag) { MKL_INT n = A.graph.row_map.dimension_0() - 1 ; float *A_values = A.values.ptr_on_device() ; MKL_INT *col_indices = A.graph.entries.ptr_on_device() ; MKL_INT *row_beg = const_cast<MKL_INT*>(A.graph.row_map.ptr_on_device()) ; MKL_INT *row_end = row_beg+1; char matdescra[6] = { 'G', 'x', 'N', 'C', 'x', 'x' }; char trans = 'N'; float alpha = 1.0; float beta = 0.0; float *x_values = x.ptr_on_device() ; float *y_values = y.ptr_on_device() ; mkl_scsrmv(&trans, &n, &n, &alpha, matdescra, A_values, col_indices, row_beg, row_end, x_values, &beta, y_values); }
FieldContainer_Kokkos(Kokkos::View<ScalarPoindex_typeer,Kokkos::LayoutLeft,Kokkos::OpenMP>& InContainer){ dim0=dim[0]=InContainer.dimension(0); dim1=dim[1]=InContainer.dimension(1); dim2=dim[2]=InContainer.dimension(2); dim3=dim[3]=InContainer.dimension(3); dim4=dim[4]=InContainer.dimension(4); dim5=dim[5]=InContainer.dimension(5); dim6=dim[6]=InContainer.dimension(6); dim7=dim[7]=InContainer.dimension(7); rankValue=Kokkos::View<ScalarPoindex_typeer,Kokkos::LayoutLeft,Kokkos::OpenMP>::Rank; intepidManaged=false; switch(rankValue){ case 1: sizeValue=dim0; break; case 2: sizeValue=dim0*dim1; break; case 3: sizeValue=dim0*dim1*dim2; break; case 4: sizeValue=dim0*dim1*dim2*dim3; break; case 5: sizeValue=dim0*dim1*dim2*dim3*dim4; break; case 6: sizeValue=dim0*dim1*dim2*dim3*dim4*dim5; break; case 7: sizeValue=dim0*dim1*dim2*dim3*dim4*dim5*dim6; break; case 8: sizeValue=dim0*dim1*dim2*dim3*dim4*dim5*dim6*dim7; break; } containerMemory=InContainer.ptr_on_device(); }
KOKKOS_INLINE_FUNCTION void operator()( const typename policy_type::member_type ind , value_type & error ) const { if ( 0 == ind.league_rank() && 0 == ind.team_rank() ) { const long int thread_count = ind.league_size() * ind.team_size(); total() = ( thread_count * ( thread_count + 1 ) ) / 2 ; } // Team max: const int long m = ind.team_reduce( (long int) ( ind.league_rank() + ind.team_rank() ) , JoinMax() ); if ( m != ind.league_rank() + ( ind.team_size() - 1 ) ) { #ifndef __KALMAR_ACCELERATOR__ printf("ScanTeamFunctor[%d.%d of %d.%d] reduce_max_answer(%ld) != reduce_max(%ld)\n" , ind.league_rank(), ind.team_rank() , ind.league_size(), ind.team_size() , (long int)(ind.league_rank() + ( ind.team_size() - 1 )) , m ); #endif } // Scan: const long int answer = ( ind.league_rank() + 1 ) * ind.team_rank() + ( ind.team_rank() * ( ind.team_rank() + 1 ) ) / 2 ; const long int result = ind.team_scan( ind.league_rank() + 1 + ind.team_rank() + 1 ); const long int result2 = ind.team_scan( ind.league_rank() + 1 + ind.team_rank() + 1 ); if ( answer != result || answer != result2 ) { #ifndef __KALMAR_ACCELERATOR__ printf("ScanTeamFunctor[%d.%d of %d.%d] answer(%ld) != scan_first(%ld) or scan_second(%ld)\n", ind.league_rank(), ind.team_rank(), ind.league_size(), ind.team_size(), answer,result,result2); #endif error = 1 ; } const long int thread_rank = ind.team_rank() + ind.team_size() * ind.league_rank(); ind.team_scan( 1 + thread_rank , accum.ptr_on_device() ); }
SortView( const Kokkos::View<ValueType*,Kokkos::Cuda> v , int begin , int end ) { thrust::sort( thrust::device_ptr<ValueType>( v.ptr_on_device() + begin ) , thrust::device_ptr<ValueType>( v.ptr_on_device() + end ) ); }
SortView( const Kokkos::View<ValueType*,ExecSpace> v , int begin , int end ) { std::sort( v.ptr_on_device() + begin , v.ptr_on_device() + end ); }