// Team-level triangular solve, specialization
// <Side::Left, Uplo::Upper, Trans::ConjTranspose, ForTriSolveBlockedVar1>.
//
// B is scaled by alpha and then updated in place by a forward sweep over the
// rows of A (k = 0 .. NumRows-1). For every row k:
//   1. unless diagA == Diag::Unit, row k of B is divided by conj(a.Value(0)),
//      i.e. entry 0 of the sparse row is used as the diagonal;
//   2. for every further stored entry p >= 1 of row k, the B row indexed by
//      a.Col(p) has conj(a.Value(p)) * B(k, :) subtracted from it;
//   3. the team synchronizes before moving to the next row.
//
// policy : not used in this body.
// member : team handle used for the nested parallel loops and the barrier.
// diagA  : compared against Diag::Unit to decide whether to divide.
// alpha  : scaling factor applied to B up front.
// A      : sparse matrix accessed through row views.
// B      : dense matrix, overwritten with the result.
// Returns 0 in all cases.
KOKKOS_INLINE_FUNCTION
int
Trsm<Side::Left,Uplo::Upper,Trans::ConjTranspose,
     AlgoTrsm::ForTriSolveBlockedVar1>
::invoke(typename CrsExecViewTypeA::policy_type &policy,
         const typename CrsExecViewTypeA::policy_type::member_type &member,
         const int diagA,
         const ScalarType alpha,
         CrsExecViewTypeA &A,
         DenseExecViewTypeB &B) {
  using ordinal_type  = typename CrsExecViewTypeA::ordinal_type;
  using value_type    = typename CrsExecViewTypeA::value_type;
  using row_view_type = typename CrsExecViewTypeA::row_view_type;

  // B := alpha * B
  scaleDenseMatrix(member, alpha, B);

  const ordinal_type numRowsA = A.NumRows();
  const ordinal_type numColsB = B.NumCols();

  // No columns in B: nothing left to do after the scaling.
  if (numColsB <= 0)
    return 0;

  for (ordinal_type row = 0; row < numRowsA; ++row) {
    row_view_type &rowA = A.RowView(row);
    const value_type conjDiag = conj(rowA.Value(0));

    // b1t := b1t / conj(diag), skipped for a unit diagonal.
    if (diagA != Diag::Unit) {
      Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numColsB),
                           [&](const ordinal_type col) {
                             B.Value(row, col) /= conjDiag;
                           });
    }

    // B2 := B2 - trans(conj(a12t)) * b1t
    // The same TeamThreadRange over the columns is used as above, so each
    // column is read here by the loop index that wrote it.
    const ordinal_type rowNnz = rowA.NumNonZeros();
    if (rowNnz > 0) {
      Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numColsB),
                           [&](const ordinal_type col) {
                             // Value of the already-solved row in this column.
                             const value_type solved = B.Value(row, col);

                             // Entry 0 is the diagonal; start at 1.
                             for (ordinal_type p = 1; p < rowNnz; ++p) {
                               const ordinal_type targetRow = rowA.Col(p);
                               const value_type   coeff     = conj(rowA.Value(p));

                               B.Value(targetRow, col) -= coeff*solved;
                             }
                           });
    }

    member.team_barrier();
  }

  return 0;
}
// Team-level triangular solve, specialization
// <Side::Left, Uplo::Upper, Trans::NoTranspose, ForTriSolveBlocked, Variant::One>.
//
// B is scaled by alpha and then updated in place by a backward sweep over the
// rows of A (k = NumRows-1 .. 0). For every row k that has stored entries:
//   1. for each stored entry p >= 1, a.Value(p) * B(a.Col(p), :) is subtracted
//      from row k of B (entry 0 of the sparse row is used as the diagonal);
//   2. unless diagA == Diag::Unit, row k of B is divided by that diagonal;
//   3. the team synchronizes.
// Rows whose view reports no stored entries are skipped entirely, including
// the division and the barrier.
//
// policy : not used in this body.
// member : team handle used for the nested parallel loops and the barrier.
// diagA  : compared against Diag::Unit to decide whether to divide.
// alpha  : scaling factor applied to B up front.
// A      : sparse matrix accessed through row views.
// B      : dense matrix, overwritten with the result.
// Returns 0 in all cases.
//
// NOTE: the loop condition `row >= 0` only terminates if ordinal_type is a
// signed type.
KOKKOS_INLINE_FUNCTION
int
Trsm<Side::Left,Uplo::Upper,Trans::NoTranspose,
     AlgoTrsm::ForTriSolveBlocked,Variant::One>
::invoke(typename CrsExecViewTypeA::policy_type &policy,
         const typename CrsExecViewTypeA::policy_type::member_type &member,
         const int diagA,
         const ScalarType alpha,
         CrsExecViewTypeA &A,
         DenseExecViewTypeB &B) {
  using ordinal_type  = typename CrsExecViewTypeA::ordinal_type;
  using value_type    = typename CrsExecViewTypeA::value_type;
  using row_view_type = typename CrsExecViewTypeA::row_view_type;

  // B := alpha * B
  scaleDenseMatrix(member, alpha, B);

  const ordinal_type numRowsA = A.NumRows();
  const ordinal_type numColsB = B.NumCols();

  // No columns in B: nothing left to do after the scaling.
  if (numColsB <= 0)
    return 0;

  for (ordinal_type row = numRowsA - 1; row >= 0; --row) {
    row_view_type &rowA = A.RowView(row);
    const value_type diagValue = rowA.Value(0);

    const ordinal_type rowNnz = rowA.NumNonZeros();
    if (rowNnz <= 0)
      continue;

    // b1t := b1t - a12t * B2
    Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numColsB),
                         [&](const ordinal_type col) {
                           // Entry 0 is the diagonal; start at 1.
                           for (ordinal_type p = 1; p < rowNnz; ++p) {
                             const ordinal_type sourceRow = rowA.Col(p);   // row of B2
                             const value_type   coeff     = rowA.Value(p); // a12t entry

                             B.Value(row, col) -= coeff*B.Value(sourceRow, col);
                           }
                         });

    // b1t := b1t / diag, skipped for a unit diagonal.
    // Uses the same TeamThreadRange over the columns as the update above.
    if (diagA != Diag::Unit) {
      Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numColsB),
                           [&](const ordinal_type col) {
                             B.Value(row, col) /= diagValue;
                           });
    }

    member.team_barrier();
  }

  return 0;
}
// Team-level sparse update C := beta*C + alpha * A' * B, specialization
// <Trans::ConjTranspose, Trans::NoTranspose, AlgoGemm::ForFactorBlocked>,
// with A, B and C all accessed through sparse row views.
//
// C is scaled by beta first. Then, for every row k of A (paired with row k of
// B), each stored entry p of A's row selects the C row a.Col(p). Every stored
// entry q of B's row whose column is found in that C row (Index(...) >= 0)
// receives alpha * conj(a.Value(p)) * b.Value(q). Entries not found in C are
// skipped. The loop over p is distributed with ParallelForType; the team
// synchronizes after each row k that did work.
//
// policy : not used in this body.
// member : team handle used for the parallel region and the barrier.
// alpha  : scale on the product.
// A, B   : sparse inputs accessed through row views.
// beta   : scale applied to C up front.
// C      : sparse output, updated in place.
// Returns 0 in all cases.
KOKKOS_INLINE_FUNCTION
int
Gemm<Trans::ConjTranspose,Trans::NoTranspose,
     AlgoGemm::ForFactorBlocked>
::invoke(typename CrsExecViewTypeA::policy_type &policy,
         const typename CrsExecViewTypeA::policy_type::member_type &member,
         const ScalarType alpha,
         CrsExecViewTypeA &A,
         CrsExecViewTypeB &B,
         const ScalarType beta,
         CrsExecViewTypeC &C) {
  using ordinal_type      = typename CrsExecViewTypeA::ordinal_type;
  using value_type        = typename CrsExecViewTypeA::value_type;
  using row_view_type     = typename CrsExecViewTypeA::row_view_type;
  using team_factory_type = typename CrsExecViewTypeA::team_factory_type;

  // C := beta * C
  scaleCrsMatrix<ParallelForType>(member, beta, C);

  // C(i,j) += alpha*A'(i,k)*B(k,j)
  const ordinal_type numRowsA = A.NumRows();
  for (ordinal_type row = 0; row < numRowsA; ++row) {
    row_view_type &rowA = A.RowView(row);
    const ordinal_type nnzA = rowA.NumNonZeros();

    row_view_type &rowB = B.RowView(row);
    const ordinal_type nnzB = rowB.NumNonZeros();

    // Skip rows where either operand contributes nothing.
    if (nnzA <= 0 || nnzB == 0)
      continue;

    ParallelForType(team_factory_type::createThreadLoopRegion(member, 0, nnzA),
                    [&](const ordinal_type p) {
                      const ordinal_type targetRow = rowA.Col(p);
                      const value_type   aConj     = conj(rowA.Value(p));

                      row_view_type &rowC = C.RowView(targetRow);

                      // `pos` is fed back into Index as its second argument.
                      // The scan stops as soon as Index returns a value <= -2
                      // (presumably "no further match possible" - confirm
                      // against the row view's Index implementation).
                      ordinal_type pos = 0;
                      for (ordinal_type q = 0; q < nnzB && (pos > -2); ++q) {
                        const ordinal_type colB = rowB.Col(q);
                        const value_type   valB = rowB.Value(q);

                        pos = rowC.Index(colB, pos);
                        if (pos >= 0)
                          rowC.Value(pos) += alpha*aConj*valB;
                      }
                    });
    member.team_barrier();
  }

  return 0;
}
// Team-level update C := beta*C + alpha * A' * B, specialization
// <Trans::ConjTranspose, Trans::NoTranspose, AlgoGemm::ForTriSolveBlocked>,
// with a sparse A (row views) and dense B and C.
//
// C is scaled by beta first. Then, for every row k of A, each stored entry p
// of that row selects the C row a.Col(p) and adds
// alpha * conj(a.Value(p)) * B(k, j) to it for every column j of B.
// The loop over p is distributed with ParallelForType; the team synchronizes
// after each row k that did work.
//
// policy : not used in this body.
// member : team handle used for the parallel region and the barrier.
// alpha  : scale on the product.
// A      : sparse input accessed through row views.
// B      : dense input.
// beta   : scale applied to C up front.
// C      : dense output, updated in place.
// Returns 0 in all cases.
KOKKOS_INLINE_FUNCTION
int
Gemm<Trans::ConjTranspose,Trans::NoTranspose,
     AlgoGemm::ForTriSolveBlocked>
::invoke(typename CrsExecViewTypeA::policy_type &policy,
         const typename CrsExecViewTypeA::policy_type::member_type &member,
         const ScalarType alpha,
         CrsExecViewTypeA &A,
         DenseExecViewTypeB &B,
         const ScalarType beta,
         DenseExecViewTypeC &C) {
  using ordinal_type      = typename CrsExecViewTypeA::ordinal_type;
  using value_type        = typename CrsExecViewTypeA::value_type;
  using row_view_type     = typename CrsExecViewTypeA::row_view_type;
  using team_factory_type = typename CrsExecViewTypeA::team_factory_type;

  // C := beta * C
  scaleDenseMatrix<ParallelForType>(member, beta, C);

  // C(i,j) += alpha*A'(i,k)*B(k,j)
  const ordinal_type numRowsA = A.NumRows();
  for (ordinal_type row = 0; row < numRowsA; ++row) {
    row_view_type &rowA = A.RowView(row);
    const ordinal_type nnzA = rowA.NumNonZeros();
    const ordinal_type numColsB = B.NumCols();

    // Skip when the sparse row is empty or B has no columns.
    if (nnzA <= 0 || numColsB <= 0)
      continue;

    ParallelForType(team_factory_type::createThreadLoopRegion(member, 0, nnzA),
                    [&](const ordinal_type p) {
                      const ordinal_type targetRow = rowA.Col(p);
                      const value_type   aConj     = conj(rowA.Value(p));

                      for (ordinal_type col = 0; col < numColsB; ++col) {
                        const value_type valB = B.Value(row, col);

                        C.Value(targetRow, col) += alpha*aConj*valB;
                      }
                    });
    member.team_barrier();
  }

  return 0;
}
// Team-level sparse rank-k style update C := beta*C + alpha * A' * A,
// specialization <Uplo::Upper, Trans::ConjTranspose, AlgoHerk::ForFactorBlocked>.
//
// C is scaled by beta first. Then, for every row k of A with nnz stored
// entries, the active code path flattens all nnz*nnz ordered pairs (p, q) of
// entries into one team-parallel loop. For each pair it looks up column
// a.Col(q) in C's row a.Col(p), and when the lookup returns a non-negative
// position it adds alpha * a.Value(p) * a.Value(q) there. Pairs whose lookup
// returns a negative value are skipped. The team synchronizes after each
// non-empty row.
//
// The `#if 0` branch is a disabled alternative that parallelizes over p only
// and scans q = p .. nnz-1 sequentially.
//
// NOTE(review): neither branch applies conj() to a.Value(p) (the conjugated
// line is commented out) although the specialization is tagged ConjTranspose;
// confirm this is intended.
//
// policy : not used in this body.
// member : team handle used for the parallel loop and the barrier.
// alpha  : scale on the product.
// A      : sparse input accessed through row views.
// beta   : scale applied to C up front.
// C      : sparse output, updated in place.
// Returns 0 in all cases.
KOKKOS_INLINE_FUNCTION
int
Herk<Uplo::Upper,Trans::ConjTranspose,
     AlgoHerk::ForFactorBlocked>
::invoke(typename CrsExecViewTypeA::policy_type &policy,
         const typename CrsExecViewTypeA::policy_type::member_type &member,
         const ScalarType alpha,
         typename CrsExecViewTypeA::matrix_type &A,
         const ScalarType beta,
         typename CrsExecViewTypeC::matrix_type &C) {
  using ordinal_type  = typename CrsExecViewTypeA::ordinal_type;
  using value_type    = typename CrsExecViewTypeA::value_type;
  using row_view_type = typename CrsExecViewTypeA::row_view_type;

  // Debug trace of the C block extents; permanently disabled by `false &&`.
  if ( false && member.team_rank() == 0 ) {
    printf("Herk [%d +%d)x[%d +%d)\n"
          , C.OffsetRows()
          , C.NumRows()
          , C.OffsetCols()
          , C.NumCols()
          );
  }

  // C := beta * C
  scaleCrsMatrix<ScalarType,CrsExecViewTypeC>(member, beta, C);

  // C(i,j) += alpha*A'(i,k)*A(k,j)
  for (ordinal_type row = 0; row < A.NumRows(); ++row) {
    row_view_type &rowA = A.RowView(row);
    const ordinal_type rowNnz = rowA.NumNonZeros();

    if (rowNnz <= 0)
      continue;

#if 0
    // Disabled variant: one parallel iteration per entry p, sequential scan
    // over q >= p reusing the previous Index result as the second argument.
    Kokkos::parallel_for(
      Kokkos::TeamThreadRange(member, 0, rowNnz),
      [&](const ordinal_type p) {
        const ordinal_type targetRow = rowA.Col(p);
        // const value_type valP = conj(rowA.Value(p));
        const value_type valP = rowA.Value(p);

        row_view_type &rowC = C.RowView(targetRow);

        ordinal_type pos = 0;
        for (ordinal_type q = p; q < rowNnz && (pos > -2); ++q) {
          const ordinal_type colQ = rowA.Col(q);
          const value_type   valQ = rowA.Value(q);

          pos = rowC.Index(colQ, pos);
          if (pos >= 0)
            rowC.Value(pos) += alpha*valP*valQ;
        }
      });
#else
    // Active variant: one parallel iteration per ordered pair (p, q).
    Kokkos::parallel_for(
      Kokkos::TeamThreadRange(member, 0, rowNnz*rowNnz),
      [&](const ordinal_type flat) {
        // Split the flat index into the pair of entry positions.
        const ordinal_type p = flat / rowNnz ;
        const ordinal_type q = flat % rowNnz ;

        row_view_type &rowC = C.RowView( rowA.Col(p) );

        const ordinal_type pos = rowC.Index( rowA.Col(q) );

        // A negative position means the column was not found in C's row.
        if (pos >= 0) {
          rowC.Value(pos) += alpha* rowA.Value(p) * rowA.Value(q);
        }
      });
#endif

    member.team_barrier();
  }

  return 0;
}
// Team-level sparse update C := beta*C + alpha * A' * B, specialization
// <Trans::ConjTranspose, Trans::NoTranspose, AlgoGemm::ForFactorBlocked>,
// taking the underlying matrix_type of each exec view.
//
// C is scaled by beta first. Then, for every row k of A (paired with row k of
// B), the active code path flattens all nnz_a * nnz_b pairs (i, j) - entry i
// of A's row, entry j of B's row - into one team-parallel loop. For each pair
// it looks up column b.Col(j) in C's row a.Col(i), and when the lookup returns
// a non-negative position it adds alpha * a.Value(i) * b.Value(j) there.
// Pairs whose lookup returns a negative value are skipped. The team
// synchronizes after each row k that did work.
//
// The `#if 0` branch is a disabled alternative that parallelizes over i only
// and scans j sequentially.
//
// NOTE(review): neither branch applies conj() to a.Value(i) (the conjugated
// line is commented out) although the specialization is tagged ConjTranspose;
// confirm this is intended.
//
// policy : not used in this body.
// member : team handle used for the parallel loop and the barrier.
// alpha  : scale on the product.
// A, B   : sparse inputs accessed through row views.
// beta   : scale applied to C up front.
// C      : sparse output, updated in place.
// Returns 0 in all cases.
KOKKOS_INLINE_FUNCTION
int
Gemm<Trans::ConjTranspose,Trans::NoTranspose,
     AlgoGemm::ForFactorBlocked>
::invoke(typename CrsExecViewTypeA::policy_type &policy,
         const typename CrsExecViewTypeA::policy_type::member_type &member,
         const ScalarType alpha,
         typename CrsExecViewTypeA::matrix_type &A,
         typename CrsExecViewTypeB::matrix_type &B,
         const ScalarType beta,
         typename CrsExecViewTypeC::matrix_type &C) {
  using ordinal_type  = typename CrsExecViewTypeA::ordinal_type;
  using value_type    = typename CrsExecViewTypeA::value_type;
  using row_view_type = typename CrsExecViewTypeA::row_view_type;

  // Debug trace of the C block extents; permanently disabled by `false &&`.
  if ( false && member.team_rank() == 0 ) {
    printf("Gemm [%d +%d)x[%d +%d)\n"
          , C.OffsetRows()
          , C.NumRows()
          , C.OffsetCols()
          , C.NumCols()
          );
  }

  // C := beta * C
  scaleCrsMatrix<ScalarType,CrsExecViewTypeC>(member, beta, C);

  // Sparse matrix-matrix multiply:
  // C(i,j) += alpha*A'(i,k)*B(k,j)
  const ordinal_type mA = A.NumRows();
  for (ordinal_type k=0;k<mA;++k) {
    row_view_type &a = A.RowView(k);
    const ordinal_type nnz_a = a.NumNonZeros();

    row_view_type &b = B.RowView(k);
    const ordinal_type nnz_b = b.NumNonZeros();

    if (nnz_a > 0 && nnz_b > 0 ) {
#if 0
      // Disabled variant: one parallel iteration per entry i of A's row,
      // sequential scan over B's row reusing the previous Index result.
      Kokkos::parallel_for(
        Kokkos::TeamThreadRange(member, 0, nnz_a),
        [&](const ordinal_type i) {
          const ordinal_type row_at_i  = a.Col(i);
          const value_type   val_at_ik = a.Value(i);
          // const value_type val_at_ik = conj(a.Value(i));

          row_view_type &c = C.RowView(row_at_i);

          ordinal_type idx = 0;
          for (ordinal_type j=0;j<nnz_b && (idx > -2);++j) {
            const ordinal_type col_at_j  = b.Col(j);
            const value_type   val_at_kj = b.Value(j);

            idx = c.Index(col_at_j, idx);
            if (idx >= 0)
              c.Value(idx) += alpha*val_at_ik*val_at_kj;
          }
        });
#else
      // Active variant: one parallel iteration per pair (i, j).
      Kokkos::parallel_for(
        Kokkos::TeamThreadRange(member, 0, nnz_a * nnz_b ),
        [&](const ordinal_type ii) {
          // Row-major split of the flat index over an nnz_a x nnz_b grid:
          // i in [0, nnz_a) indexes row a, j in [0, nnz_b) indexes row b.
          // The split must use nnz_b; dividing by nnz_a would give
          // i in [0, nnz_b) and j in [0, nnz_a), indexing `a` out of range
          // whenever nnz_b > nnz_a and missing pairs otherwise.
          const ordinal_type i = ii / nnz_b ;
          const ordinal_type j = ii % nnz_b ;

          row_view_type &c = C.RowView( a.Col(i) );

          // Look up b.Col(j) in C's row; negative means "not present".
          const ordinal_type idx = c.Index( b.Col(j) );

          if (idx >= 0) {
            // const value_type val_at_ik = conj(a.Value(i));
            c.Value(idx) += alpha * a.Value(i) * b.Value(j);
          }
        });
#endif
      member.team_barrier();
    }
  }

  return 0;
}