static void GEMM (Teuchos::ETransp transA, Teuchos::ETransp transB, Scalar alpha,
                  Kokkos::View<Scalar***, Kokkos::LayoutLeft, Kokkos::DefaultExecutionSpace> A,
                  Kokkos::View<Scalar***, Kokkos::LayoutLeft, Kokkos::DefaultExecutionSpace> B,
                  Scalar beta,
                  Kokkos::View<Scalar***, Kokkos::LayoutLeft, Kokkos::DefaultExecutionSpace> C)
{
  // Matrix dimensions: each C(i) is m x n; k is the contracted dimension,
  // read from A according to whether A is transposed.
  const int m = static_cast<int> (C.dimension_1 ());
  const int n = static_cast<int> (C.dimension_2 ());
  const int k = static_cast<int> (transA == Teuchos::NO_TRANS ? A.dimension_2 () : A.dimension_1 ());
  // One batch entry per parallel iteration.
  Kokkos::parallel_for (C.dimension_0 (),
                        blasOpenMPBatchLeft<Scalar> (A, B, C, m, n, k, transA, transB, alpha, beta));
}
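The blasOpenMPBatchLeft functor that the LayoutLeft overload dispatches to is not shown in this section. A minimal sketch of what such a per-batch GEMM functor could look like follows; the functor name and constructor arguments are taken from the call above, but the body (a naive triple loop, written out only for the NO_TRANS/NO_TRANS case) is an assumption, not the actual implementation.

// Hypothetical sketch of a per-batch GEMM functor such as blasOpenMPBatchLeft.
// Only the Teuchos::NO_TRANS / Teuchos::NO_TRANS case is written out.
template<class Scalar>
struct blasOpenMPBatchLeft {
  typedef Kokkos::View<Scalar***, Kokkos::LayoutLeft, Kokkos::DefaultExecutionSpace> batch_view;
  batch_view A, B, C;
  int m, n, k;
  Teuchos::ETransp transA, transB;
  Scalar alpha, beta;

  blasOpenMPBatchLeft (batch_view A_, batch_view B_, batch_view C_,
                       int m_, int n_, int k_,
                       Teuchos::ETransp tA, Teuchos::ETransp tB,
                       Scalar alpha_, Scalar beta_)
    : A (A_), B (B_), C (C_), m (m_), n (n_), k (k_),
      transA (tA), transB (tB), alpha (alpha_), beta (beta_) {}

  KOKKOS_INLINE_FUNCTION
  void operator() (const size_t i) const {
    // Naive dense GEMM for batch entry i: C(i) = alpha*A(i)*B(i) + beta*C(i).
    for (int col = 0; col < n; ++col) {
      for (int row = 0; row < m; ++row) {
        Scalar sum = 0;
        for (int p = 0; p < k; ++p)
          sum += A(i,row,p) * B(i,p,col);
        C(i,row,col) = alpha * sum + beta * C(i,row,col);
      }
    }
  }
};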
static void GEMM (Teuchos::ETransp transA, Teuchos::ETransp transB, Scalar alpha,
                  Kokkos::View<Scalar***, Kokkos::LayoutRight, Kokkos::DefaultExecutionSpace> A,
                  Kokkos::View<Scalar***, Kokkos::LayoutRight, Kokkos::DefaultExecutionSpace> B,
                  Scalar beta,
                  Kokkos::View<Scalar***, Kokkos::LayoutRight, Kokkos::DefaultExecutionSpace> C)
{
  const int m = static_cast<int> (C.dimension_1 ());
  const int n = static_cast<int> (C.dimension_2 ());
  const int k = static_cast<int> (transA == Teuchos::NO_TRANS ? A.dimension_2 () : A.dimension_1 ());
  Teuchos::BLAS<int, Scalar> blas;
  // Each batch entry is row major (LayoutRight), but Teuchos::BLAS expects
  // column-major data.  Compute C^T = op(B)^T * op(A)^T instead: swapping the
  // operand order makes the column-major result C^T coincide with the
  // row-major C in memory.  Calling Teuchos::BLAS inside the lambda requires
  // DefaultExecutionSpace to be a host execution space.
  Kokkos::parallel_for (C.dimension_0 (), KOKKOS_LAMBDA (const size_t i) {
    blas.GEMM (transB, transA, n, m, k, alpha,
               &B(i,0,0), n, &A(i,0,0), k, beta, &C(i,0,0), n);
  });
}
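A call site for these overloads might look like the following sketch. The enclosing class name BatchedBLAS is hypothetical (the class that holds these static GEMM overloads is not shown in this section), and the batch sizes are arbitrary.

// Hypothetical driver; BatchedBLAS stands in for the class containing the
// static GEMM overloads above.
const int numMats = 128, m = 4, n = 4, k = 4;
Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::DefaultExecutionSpace>
  A ("A", numMats, m, k),
  B ("B", numMats, k, n),
  C ("C", numMats, m, n);
// ... fill A and B ...
BatchedBLAS<double>::GEMM (Teuchos::NO_TRANS, Teuchos::NO_TRANS,
                           1.0, A, B, 0.0, C);
Kokkos::fence ();  // wait for the batched GEMMs to finish before reading C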
KOKKOS_INLINE_FUNCTION
void operator() (const team_member& thread) const {
  int i = thread.league_rank ();

  // Allocate a shared array for the team.
  shared_1d_int count (thread.team_shmem (), data.dimension_1 ());

  // With each team run a parallel_for with its threads.
  Kokkos::parallel_for (Kokkos::TeamThreadRange (thread, data.dimension_1 ()), [=] (const int& j) {
    int tsum;
    // Run a vector loop reduction over the inner dimension of data to count
    // how many values are multiples of 4.  Every vector lane gets the same
    // reduction value (tsum) back; it is broadcast to all vector lanes.
    Kokkos::parallel_reduce (Kokkos::ThreadVectorRange (thread, data.dimension_2 ()),
                             [=] (const int& k, int& vsum) {
      vsum += (data(i,j,k) % 4 == 0) ? 1 : 0;
    }, tsum);

    // Make sure only one vector lane adds the reduction value to the shared
    // array, i.e., execute the next line only once per thread.
    Kokkos::single (Kokkos::PerThread (thread), [=] () {
      count(j) = tsum;
    });
  });

  // Wait for all threads to finish the parallel_for so that all shared
  // memory writes are done.
  thread.team_barrier ();

  // Check with one vector lane from each thread how many consecutive data
  // segments have the same number of values divisible by 4.  The team
  // reduction value is again broadcast to every team member (and every
  // vector lane).
  int team_sum = 0;
  Kokkos::parallel_reduce (Kokkos::TeamThreadRange (thread, data.dimension_1 () - 1),
                           [=] (const int& j, int& thread_sum) {
    // It is not valid to directly add to thread_sum; use a single function
    // with broadcast instead.  thread_sum is used as input to the operator
    // (i.e., it initializes sum), and the end value of sum is broadcast to
    // all vector lanes in the thread.
    Kokkos::single (Kokkos::PerThread (thread), [=] (int& sum) {
      if (count(j) == count(j+1)) sum++;
    }, thread_sum);
  }, team_sum);

  // Add team_sum to the global value with one thread and vector lane of the team.
  Kokkos::single (Kokkos::PerTeam (thread), [=] () {
    Kokkos::atomic_add (&gsum(), team_sum);
  });
}
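The operator() above relies on several pieces of its enclosing functor: the team_member and shared_1d_int typedefs, the data and gsum views, and a team_shmem_size hook that tells Kokkos how much per-team scratch memory to reserve. A sketch of those pieces and of the TeamPolicy launch, modeled on the Kokkos hierarchical-parallelism tutorial (the functor name SomeCorrelation and the vector length are assumptions):

// Sketch of the surrounding functor; names not used in the operator() above
// are assumptions.
typedef Kokkos::TeamPolicy<>::member_type team_member;

struct SomeCorrelation {
  typedef int value_type;  // reduction value type
  typedef Kokkos::DefaultExecutionSpace::scratch_memory_space shared_space;
  typedef Kokkos::View<int*, shared_space, Kokkos::MemoryUnmanaged> shared_1d_int;

  Kokkos::View<const int***, Kokkos::LayoutRight> data;  // read-only input
  Kokkos::View<int> gsum;                                // global result

  SomeCorrelation (Kokkos::View<int***, Kokkos::LayoutRight> data_in,
                   Kokkos::View<int> sum)
    : data (data_in), gsum (sum) {}

  // ... operator() as above goes here ...

  // Tell Kokkos how much team shared memory the functor needs.
  size_t team_shmem_size (int team_size) const {
    return shared_1d_int::shmem_size (data.dimension_1 ());
  }
};

// Launch: one team per data(i,:,:) slice.  The vector length of 4 is an
// arbitrary choice for this sketch.
Kokkos::TeamPolicy<> policy (data.dimension_0 (), Kokkos::AUTO, 4);
Kokkos::parallel_for (policy, SomeCorrelation (data, gsum));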