// Team-level triangular solve with a sparse factor, specialised for
// Side::Left / Uplo::Upper / Trans::ConjTranspose.
//
// B is first scaled by alpha and then overwritten, row by row, with the
// result of a forward sweep over the rows of A: row k of B is divided by the
// conjugated diagonal of A's row k (unless diagA == Diag::Unit) and then
// subtracted, scaled by the conjugated off-diagonal entries of that row, from
// the rows of B indexed by those entries' column indices.
//
//   policy : not referenced in this body.
//   member : team handle used for the TeamThreadRange loops and the barrier.
//   diagA  : Diag::Unit skips the division by the diagonal.
//   alpha  : scaling applied to B through scaleDenseMatrix before the solve.
//   A      : sparse matrix read through RowView(k); entry 0 of a row is used
//            as its diagonal and entries 1..nnz-1 as the off-diagonal part.
//   B      : dense right-hand side, updated in place.
//
// Always returns 0.
KOKKOS_INLINE_FUNCTION
int
Trsm<Side::Left,Uplo::Upper,Trans::ConjTranspose,
     AlgoTrsm::ForTriSolveBlockedVar1>
::invoke(typename CrsExecViewTypeA::policy_type &policy,
         const typename CrsExecViewTypeA::policy_type::member_type &member,
         const int diagA,
         const ScalarType alpha,
         CrsExecViewTypeA &A,
         DenseExecViewTypeB &B) {
  typedef typename CrsExecViewTypeA::ordinal_type  ordinal_type;
  typedef typename CrsExecViewTypeA::value_type    value_type;
  typedef typename CrsExecViewTypeA::row_view_type row_view_type;

  // scale the matrix B with alpha
  scaleDenseMatrix(member, alpha, B);

  const ordinal_type mA = A.NumRows();
  const ordinal_type nB = B.NumCols();

  if (nB > 0) {
    for (ordinal_type k=0;k<mA;++k) {
      row_view_type &a = A.RowView(k);
      const ordinal_type nnz_a = a.NumNonZeros();

      // invert: b1t = b1t / conj(diag)
      // The diagonal is fetched only when it is actually needed (non-unit
      // diagonal) and only when the row has a stored entry to fetch; a row
      // with no stored entries has no entry 0 to read.
      if (diagA != Diag::Unit && nnz_a > 0) {
        const value_type cdiag = conj(a.Value(0));
        Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, nB),
                             [&](const ordinal_type j) {
                               B.Value(k, j) /= cdiag;
                             });
      }

      // update: B2 = B2 - trans(conj(a12t)) b1t
      // NOTE(review): no barrier separates the division loop from this one;
      // this appears to rely on both TeamThreadRange(member, 0, nB) loops
      // handing column j to the same team member - confirm.
      if (nnz_a > 0) {
        Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, nB),
                             [&](const ordinal_type j) {
                               // grab b1t
                               const value_type val_at_j = B.Value(k, j);

                               // entry 0 is the diagonal, so start at 1
                               for (ordinal_type i=1;i<nnz_a;++i) {
                                 // grab a12t
                                 const ordinal_type row_at_i = a.Col(i);
                                 const value_type   val_at_i = conj(a.Value(i));

                                 // update B2
                                 B.Value(row_at_i, j) -= val_at_i*val_at_j;
                               }
                             });
      }

      // row k must be complete for the whole team before the next row starts
      member.team_barrier();
    }
  }

  return 0;
}
// Team-level triangular solve with a sparse factor, specialised for
// Side::Left / Uplo::Upper / Trans::NoTranspose.
//
// B is first scaled by alpha and then overwritten in place by a backward
// sweep over the rows of A (last row first): for row k, the products of the
// off-diagonal entries of A's row k with the rows of B they point at are
// subtracted from row k of B, after which row k of B is divided by the
// diagonal (unless diagA == Diag::Unit).
//
//   policy : not referenced in this body.
//   member : team handle used for the TeamThreadRange loops and the barrier.
//   diagA  : Diag::Unit skips the division by the diagonal.
//   alpha  : scaling applied to B through scaleDenseMatrix before the solve.
//   A      : sparse matrix read through RowView(k); entry 0 of a row is used
//            as its diagonal and entries 1..nnz-1 as the off-diagonal part.
//   B      : dense right-hand side, updated in place.
//
// Rows of A without stored entries are skipped entirely (no update, no
// division, no barrier). Always returns 0.
KOKKOS_INLINE_FUNCTION
int
Trsm<Side::Left,Uplo::Upper,Trans::NoTranspose,
     AlgoTrsm::ForTriSolveBlocked,Variant::One>
::invoke(typename CrsExecViewTypeA::policy_type &policy,
         const typename CrsExecViewTypeA::policy_type::member_type &member,
         const int diagA,
         const ScalarType alpha,
         CrsExecViewTypeA &A,
         DenseExecViewTypeB &B) {
  typedef typename CrsExecViewTypeA::ordinal_type  ordinal_type;
  typedef typename CrsExecViewTypeA::value_type    value_type;
  typedef typename CrsExecViewTypeA::row_view_type row_view_type;

  // scale the matrix B with alpha
  scaleDenseMatrix(member, alpha, B);

  const ordinal_type mA = A.NumRows();
  const ordinal_type nB = B.NumCols();

  if (nB > 0) {
    // Visit rows mA-1, mA-2, ..., 0. Counting r down to 1 and using k = r - 1
    // avoids a "k >= 0" test, so the loop terminates whether or not
    // ordinal_type is signed.
    for (ordinal_type r=mA;r>0;--r) {
      const ordinal_type k = r - 1;

      row_view_type &a = A.RowView(k);
      const ordinal_type nnz_a = a.NumNonZeros();

      if (nnz_a > 0) {
        // entry 0 is read only once the row is known to hold at least one entry
        const value_type diag = a.Value(0);

        // update: b1t = b1t - a12t B2
        Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, nB),
                             [&](const ordinal_type j) {
                               // entry 0 is the diagonal, so start at 1
                               for (ordinal_type i=1;i<nnz_a;++i) {
                                 const ordinal_type row_at_i = a.Col(i);   // grab B2 row
                                 const value_type   val_at_i = a.Value(i); // grab a12t value

                                 // update b1t
                                 B.Value(k, j) -= val_at_i*B.Value(row_at_i, j);
                               }
                             });

        // invert: b1t = b1t / diag
        // NOTE(review): no barrier separates the update loop from this one;
        // this appears to rely on both TeamThreadRange(member, 0, nB) loops
        // handing column j to the same team member - confirm.
        if (diagA != Diag::Unit) {
          Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, nB),
                               [&](const ordinal_type j) {
                                 B.Value(k, j) /= diag;
                               });
        }

        // row k must be complete for the whole team before the next row uses it
        member.team_barrier();
      }
    }
  }

  return 0;
}
// Dense triangular solve (Side::Left / Uplo::Upper / Trans::NoTranspose)
// delegated to Teuchos::BLAS::TRSM.
//
//   policy : not referenced in this body.
//   member : only the team member with team_rank() == 0 issues the BLAS call;
//            every other member returns immediately.
//   diagA  : Diag::Unit selects Teuchos::UNIT_DIAG, anything else
//            Teuchos::NON_UNIT_DIAG.
//   alpha  : scalar forwarded to TRSM.
//   A, B   : dense operands; their value pointers and the ColStride() of their
//            base objects are forwarded as the matrix / leading-dimension
//            arguments. The problem size is A.NumRows() by B.NumCols().
//
// When HAVE_SHYLUTACHO_TEUCHOS is not defined, rank 0 hits
// TACHO_TEST_FOR_ABORT instead. Always returns 0.
KOKKOS_INLINE_FUNCTION
int
Trsm<Side::Left,Uplo::Upper,Trans::NoTranspose,
     AlgoTrsm::ExternalBlas,Variant::One>
::invoke(PolicyType &policy,
         const MemberType &member,
         const int diagA,
         const ScalarType alpha,
         DenseExecViewTypeA &A,
         DenseExecViewTypeB &B) {
  // Compile-time checks kept disabled, as in the original:
  //
  // static_assert( Kokkos::Impl::is_same<
  //                typename DenseMatrixTypeA::space_type,
  //                Kokkos::Cuda
  //                >::value,
  //                "Cuda space is not available for calling external BLAS" );
  //
  // static_assert( Kokkos::Impl::is_same<
  //                typename DenseMatrixTypeA::space_type,
  //                typename DenseMatrixTypeB::space_type
  //                >::value,
  //                "Space type of input matrices does not match" );
  //
  // typedef typename DenseExecViewTypeA::space_type space_type;

  // a single team member performs the whole solve
  if (member.team_rank() != 0)
    return 0;

#ifdef HAVE_SHYLUTACHO_TEUCHOS
  typedef typename DenseExecViewTypeA::ordinal_type ordinal_type;
  typedef typename DenseExecViewTypeA::value_type   value_type;

  Teuchos::BLAS<ordinal_type,value_type> blas;

  const ordinal_type m = A.NumRows();
  const ordinal_type n = B.NumCols();
  const auto diag_flag = (diagA == Diag::Unit ? Teuchos::UNIT_DIAG
                                              : Teuchos::NON_UNIT_DIAG);

  blas.TRSM(Teuchos::LEFT_SIDE, Teuchos::UPPER_TRI, Teuchos::NO_TRANS,
            diag_flag,
            m, n,
            alpha,
            A.ValuePtr(), A.BaseObject().ColStride(),
            B.ValuePtr(), B.BaseObject().ColStride());
#else
  TACHO_TEST_FOR_ABORT( true, MSG_NOT_HAVE_PACKAGE("Teuchos") );
#endif

  return 0;
}
// Team-level sparse-times-dense product, specialised for
// Trans::ConjTranspose (A) / Trans::NoTranspose (B).
//
// C is scaled by beta through scaleDenseMatrix, and then for every stored
// entry p of every row k of A the contribution
//     C(col(p), j) += alpha * conj(A_k[p]) * B(k, j)     for all columns j of B
// is accumulated, i.e. C(i,j) += alpha*A'(i,k)*B(k,j).
//
//   policy : not referenced in this body.
//   member : team handle used for the thread-loop region and the barrier.
//   alpha  : scalar applied to each product.
//   A      : sparse matrix read row by row through RowView(k).
//   B      : dense matrix, read only.
//   beta   : scalar applied to C before accumulation.
//   C      : dense matrix, updated in place.
//
// Always returns 0.
KOKKOS_INLINE_FUNCTION
int
Gemm<Trans::ConjTranspose,Trans::NoTranspose,
     AlgoGemm::ForTriSolveBlocked>
::invoke(typename CrsExecViewTypeA::policy_type &policy,
         const typename CrsExecViewTypeA::policy_type::member_type &member,
         const ScalarType alpha,
         CrsExecViewTypeA &A,
         DenseExecViewTypeB &B,
         const ScalarType beta,
         DenseExecViewTypeC &C) {
  typedef typename CrsExecViewTypeA::ordinal_type      ordinal_type;
  typedef typename CrsExecViewTypeA::value_type        value_type;
  typedef typename CrsExecViewTypeA::row_view_type     row_view_type;
  typedef typename CrsExecViewTypeA::team_factory_type team_factory_type;

  // C <- beta * C
  scaleDenseMatrix<ParallelForType>(member, beta, C);

  const ordinal_type num_rows_A = A.NumRows();
  const ordinal_type num_cols_B = B.NumCols();

  for (ordinal_type k=0;k<num_rows_A;++k) {
    row_view_type &row_k = A.RowView(k);
    const ordinal_type nnz_k = row_k.NumNonZeros();

    // nothing to accumulate for an empty row or an empty B
    if (nnz_k <= 0 || num_cols_B <= 0)
      continue;

    // the stored entries of row k are split across the team; each entry
    // addresses the row of C given by its own column index
    ParallelForType(team_factory_type::createThreadLoopRegion(member, 0, nnz_k),
                    [&](const ordinal_type p) {
                      const ordinal_type c_row  = row_k.Col(p);
                      const value_type   a_conj = conj(row_k.Value(p));

                      for (ordinal_type j=0;j<num_cols_B;++j) {
                        const value_type b_kj = B.Value(k, j);
                        C.Value(c_row, j) += alpha*a_conj*b_kj;
                      }
                    });

    // finish row k across the team before moving on to the next row
    member.team_barrier();
  }

  return 0;
}