// Team-level triangular solve kernel for the <Left, Upper, ConjTranspose>
// specialization. B is first scaled by alpha and then updated in place, one
// row of A at a time in ascending row order:
//   1. unless diagA == Diag::Unit, row k of B is divided by conj(a.Value(0));
//   2. for every stored entry i >= 1 of row k of A, conj(a.Value(i)) * B(k,j)
//      is subtracted from B(a.Col(i), j).
// NOTE(review): this treats the first stored entry of each row of A as its
// diagonal -- confirm that the row views guarantee that ordering.
//
// policy : not referenced by this kernel.
// member : team handle used for the nested parallel loops and the barrier.
// diagA  : compared against Diag::Unit to decide whether to divide.
// alpha  : scaling factor applied to B before the sweep.
// A      : sparse matrix accessed through its row views.
// B      : dense right-hand side, overwritten in place.
// Returns 0 always.
KOKKOS_INLINE_FUNCTION
  int
  Trsm<Side::Left,Uplo::Upper,Trans::ConjTranspose,
       AlgoTrsm::ForTriSolveBlockedVar1>
  ::invoke(typename CrsExecViewTypeA::policy_type &policy,
           const typename CrsExecViewTypeA::policy_type::member_type &member,
           const int diagA,
           const ScalarType alpha,
           CrsExecViewTypeA &A,
           DenseExecViewTypeB &B) {
    using ordinal_type  = typename CrsExecViewTypeA::ordinal_type;
    using value_type    = typename CrsExecViewTypeA::value_type;
    using row_view_type = typename CrsExecViewTypeA::row_view_type;

    // B := alpha * B
    scaleDenseMatrix(member, alpha, B);

    const ordinal_type numRowsA = A.NumRows();
    const ordinal_type numColsB = B.NumCols();

    // No right-hand-side columns: nothing left to do.
    if (numColsB <= 0)
      return 0;

    for (ordinal_type row = 0; row < numRowsA; ++row) {
      row_view_type &rowA = A.RowView(row);
      const value_type conjDiag = conj(rowA.Value(0));

      // Divide the current row of B by the conjugated leading entry.
      if (diagA != Diag::Unit) {
        Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numColsB),
                             [&](const ordinal_type col) {
                               B.Value(row, col) /= conjDiag;
                             });
      }

      // Push the freshly computed row of B into the rows addressed by the
      // remaining stored entries of this row of A.
      const ordinal_type numEntries = rowA.NumNonZeros();
      if (numEntries > 0) {
        Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numColsB),
                             [&](const ordinal_type col) {
                               // value of the current row, read once per column
                               const value_type pivot = B.Value(row, col);

                               for (ordinal_type p = 1; p < numEntries; ++p) {
                                 const ordinal_type target = rowA.Col(p);
                                 const value_type   coeff  = conj(rowA.Value(p));

                                 B.Value(target, col) -= coeff*pivot;
                               }
                             });
      }

      // Team-wide barrier before moving on to the next row of A.
      member.team_barrier();
    }

    return 0;
  }
Exemplo n.º 2
0
  // Team-level triangular solve kernel for the <Left, Upper, NoTranspose>
  // specialization. B is first scaled by alpha and then updated in place, one
  // row of A at a time in descending row order. For a row k with at least one
  // stored entry:
  //   1. for every stored entry i >= 1, a.Value(i) * B(a.Col(i), j) is
  //      subtracted from B(k, j);
  //   2. unless diagA == Diag::Unit, B(k, j) is divided by a.Value(0);
  //   3. the team synchronizes on a barrier.
  // Rows without stored entries are skipped entirely (no barrier either).
  // NOTE(review): this treats the first stored entry of each row of A as its
  // diagonal -- confirm that the row views guarantee that ordering.
  //
  // policy : not referenced by this kernel.
  // member : team handle used for the nested parallel loops and the barrier.
  // diagA  : compared against Diag::Unit to decide whether to divide.
  // alpha  : scaling factor applied to B before the sweep.
  // A      : sparse matrix accessed through its row views.
  // B      : dense right-hand side, overwritten in place.
  // Returns 0 always.
  KOKKOS_INLINE_FUNCTION
  int
  Trsm<Side::Left,Uplo::Upper,Trans::NoTranspose,
       AlgoTrsm::ForTriSolveBlocked,Variant::One>
  ::invoke(typename CrsExecViewTypeA::policy_type &policy,
           const typename CrsExecViewTypeA::policy_type::member_type &member,
           const int diagA,
           const ScalarType alpha,
           CrsExecViewTypeA &A,
           DenseExecViewTypeB &B) {
    using ordinal_type  = typename CrsExecViewTypeA::ordinal_type;
    using value_type    = typename CrsExecViewTypeA::value_type;
    using row_view_type = typename CrsExecViewTypeA::row_view_type;

    // B := alpha * B
    scaleDenseMatrix(member, alpha, B);

    const ordinal_type numRowsA = A.NumRows();
    const ordinal_type numColsB = B.NumCols();

    // No right-hand-side columns: nothing left to do.
    if (numColsB <= 0)
      return 0;

    // The "row >= 0" termination test relies on ordinal_type being signed.
    for (ordinal_type row = numRowsA - 1; row >= 0; --row) {
      row_view_type &rowA = A.RowView(row);
      const value_type pivot = rowA.Value(0);

      const ordinal_type numEntries = rowA.NumNonZeros();
      if (numEntries <= 0)
        continue;

      // Subtract the contribution of the rows addressed by the trailing
      // stored entries from the current row of B.
      Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numColsB),
                           [&](const ordinal_type col) {
                             for (ordinal_type p = 1; p < numEntries; ++p) {
                               const ordinal_type source = rowA.Col(p);
                               const value_type   coeff  = rowA.Value(p);

                               B.Value(row, col) -= coeff*B.Value(source, col);
                             }
                           });

      // Divide the current row of B by the leading stored entry.
      if (diagA != Diag::Unit) {
        Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numColsB),
                             [&](const ordinal_type col) {
                               B.Value(row, col) /= pivot;
                             });
      }

      member.team_barrier();
    }

    return 0;
  }
Exemplo n.º 3
0
  // Team-level sparse update for the <ConjTranspose, NoTranspose,
  // ForFactorBlocked> specialization: C is scaled by beta, then for every row
  // k of A and B and every pair of stored entries (i of a, j of b) the value
  // alpha * conj(a.Value(i)) * b.Value(j) is added to the entry of C at row
  // a.Col(i), column b.Col(j) -- but only when c.Index() reports a
  // non-negative position for that column.
  //
  // policy : not referenced by this kernel.
  // member : team handle used for the thread loop region and the barrier.
  // alpha  : scaling factor applied to each product.
  // A, B   : sparse inputs accessed through their row views.
  // beta   : scaling factor applied to C before accumulation.
  // C      : sparse output, updated in place.
  // Returns 0 always.
  KOKKOS_INLINE_FUNCTION
  int
  Gemm<Trans::ConjTranspose,Trans::NoTranspose,
       AlgoGemm::ForFactorBlocked>
  ::invoke(typename CrsExecViewTypeA::policy_type &policy,
           const typename CrsExecViewTypeA::policy_type::member_type &member,
           const ScalarType alpha,
           CrsExecViewTypeA &A,
           CrsExecViewTypeB &B,
           const ScalarType beta,
           CrsExecViewTypeC &C) {
    using ordinal_type      = typename CrsExecViewTypeA::ordinal_type;
    using value_type        = typename CrsExecViewTypeA::value_type;
    using row_view_type     = typename CrsExecViewTypeA::row_view_type;
    using team_factory_type = typename CrsExecViewTypeA::team_factory_type;

    // C := beta * C
    scaleCrsMatrix<ParallelForType>(member, beta, C);

    const ordinal_type numRowsA = A.NumRows();
    for (ordinal_type k = 0; k < numRowsA; ++k) {
      row_view_type &rowA = A.RowView(k);
      const ordinal_type nnzA = rowA.NumNonZeros();

      row_view_type &rowB = B.RowView(k);
      const ordinal_type nnzB = rowB.NumNonZeros();

      // Skip rows that cannot contribute any product.
      if (nnzA <= 0 || nnzB == 0)
        continue;

      // One loop index per stored entry of row k of A.
      ParallelForType(team_factory_type::createThreadLoopRegion(member, 0, nnzA),
                      [&](const ordinal_type p) {
                        const ordinal_type targetRow = rowA.Col(p);
                        const value_type   conjA     = conj(rowA.Value(p));

                        row_view_type &rowC = C.RowView(targetRow);

                        // The previous position is handed back to Index() as
                        // its second argument; the scan stops as soon as
                        // Index() yields a value of -2 or below.
                        // NOTE(review): presumably that marks "past the end of
                        // the row" -- confirm against the row view.
                        ordinal_type pos = 0;
                        for (ordinal_type q = 0; q < nnzB; ++q) {
                          if (pos <= -2)
                            break;

                          const ordinal_type targetCol = rowB.Col(q);
                          const value_type   valB      = rowB.Value(q);

                          pos = rowC.Index(targetCol, pos);
                          if (pos >= 0)
                            rowC.Value(pos) += alpha*conjA*valB;
                        }
                      });

      member.team_barrier();
    }

    return 0;
  }
Exemplo n.º 4
0
  // Team-level sparse-times-dense update for the <ConjTranspose, NoTranspose,
  // ForTriSolveBlocked> specialization: C is scaled by beta, then for every
  // row k of A, every stored entry i of that row and every column j of B,
  // alpha * conj(a.Value(i)) * B(k, j) is added to C(a.Col(i), j).
  //
  // policy : not referenced by this kernel.
  // member : team handle used for the thread loop region and the barrier.
  // alpha  : scaling factor applied to each product.
  // A      : sparse input accessed through its row views.
  // B      : dense input.
  // beta   : scaling factor applied to C before accumulation.
  // C      : dense output, updated in place.
  // Returns 0 always.
  KOKKOS_INLINE_FUNCTION
  int
  Gemm<Trans::ConjTranspose,Trans::NoTranspose,
       AlgoGemm::ForTriSolveBlocked>
  ::invoke(typename CrsExecViewTypeA::policy_type &policy,
           const typename CrsExecViewTypeA::policy_type::member_type &member,
           const ScalarType alpha,
           CrsExecViewTypeA &A,
           DenseExecViewTypeB &B,
           const ScalarType beta,
           DenseExecViewTypeC &C) {
    using ordinal_type      = typename CrsExecViewTypeA::ordinal_type;
    using value_type        = typename CrsExecViewTypeA::value_type;
    using row_view_type     = typename CrsExecViewTypeA::row_view_type;
    using team_factory_type = typename CrsExecViewTypeA::team_factory_type;

    // C := beta * C
    scaleDenseMatrix<ParallelForType>(member, beta, C);

    const ordinal_type numRowsA = A.NumRows();
    for (ordinal_type k = 0; k < numRowsA; ++k) {
      row_view_type &rowA = A.RowView(k);
      const ordinal_type nnzA     = rowA.NumNonZeros();
      const ordinal_type numColsB = B.NumCols();

      // Skip rows that cannot contribute any product.
      if (nnzA <= 0 || numColsB <= 0)
        continue;

      // One loop index per stored entry of row k of A; each index sweeps all
      // columns of B.
      ParallelForType(team_factory_type::createThreadLoopRegion(member, 0, nnzA),
                      [&](const ordinal_type p) {
                        const ordinal_type targetRow = rowA.Col(p);
                        const value_type   conjA     = conj(rowA.Value(p));

                        for (ordinal_type col = 0; col < numColsB; ++col) {
                          const value_type valB = B.Value(k, col);
                          C.Value(targetRow, col) += alpha*conjA*valB;
                        }
                      });

      member.team_barrier();
    }

    return 0;
  }
  // Team-level sparse rank-k style update for the <Upper, ConjTranspose,
  // ForFactorBlocked> specialization: C is scaled by beta, then for every row
  // k of A and every ordered pair (i, j) of its stored entries,
  // alpha * a.Value(i) * a.Value(j) is added to the entry of C at row
  // a.Col(i), column a.Col(j). Pairs for which c.Index() reports a negative
  // position are skipped.
  // NOTE(review): the active branch applies no conjugate to the entries of A
  // even though the specialization is tagged ConjTranspose -- confirm this is
  // intended for complex value types.
  //
  // policy : not referenced by this kernel.
  // member : team handle used for the nested parallel loop and the barrier.
  // alpha  : scaling factor applied to each product.
  // A      : sparse input accessed through its row views.
  // beta   : scaling factor applied to C before accumulation.
  // C      : sparse output, updated in place.
  // Returns 0 always.
  KOKKOS_INLINE_FUNCTION
  int
  Herk<Uplo::Upper,Trans::ConjTranspose,
       AlgoHerk::ForFactorBlocked>
  ::invoke(typename CrsExecViewTypeA::policy_type &policy,
           const typename CrsExecViewTypeA::policy_type::member_type &member,
           const ScalarType alpha,
           typename CrsExecViewTypeA::matrix_type &A,
           const ScalarType beta,
           typename CrsExecViewTypeC::matrix_type &C) {
    using ordinal_type  = typename CrsExecViewTypeA::ordinal_type;
    using value_type    = typename CrsExecViewTypeA::value_type;
    using row_view_type = typename CrsExecViewTypeA::row_view_type;

    // Debug trace of the extents of C; permanently disabled by the
    // constant-false condition.
    if ( false && member.team_rank() == 0 ) {
      printf("Herk [%d +%d)x[%d +%d)\n",
             C.OffsetRows(), C.NumRows(), C.OffsetCols(), C.NumCols());
    }

    // C := beta * C
    scaleCrsMatrix<ScalarType,CrsExecViewTypeC>(member, beta, C);

    for (ordinal_type k = 0; k < A.NumRows(); ++k) {
      row_view_type &rowA = A.RowView(k);
      const ordinal_type numEntries = rowA.NumNonZeros();

      // Empty rows contribute nothing (and do not hit the barrier).
      if (numEntries <= 0)
        continue;

#if 0
      // Disabled alternative: one loop index per stored entry, scanning the
      // entries from that index onwards.
      Kokkos::parallel_for(
        Kokkos::TeamThreadRange(member, 0, numEntries),
        [&](const ordinal_type p) {
          const ordinal_type targetRow = rowA.Col(p);
          // const value_type   valP = conj(rowA.Value(p));
          const value_type   valP = rowA.Value(p);

          row_view_type &rowC = C.RowView(targetRow);

          ordinal_type pos = 0;
          for (ordinal_type q = p; q < numEntries && (pos > -2); ++q) {
            const ordinal_type targetCol = rowA.Col(q);
            const value_type   valQ      = rowA.Value(q);

            pos = rowC.Index(targetCol, pos);
            if (pos >= 0)
              rowC.Value(pos) += alpha*valP*valQ;
          }
        });
#else
      // Active variant: one loop index per ordered pair (p, q) of stored
      // entries, flattened into a single range of numEntries^2 indices.
      Kokkos::parallel_for(
        Kokkos::TeamThreadRange(member, 0, numEntries*numEntries),
        [&](const ordinal_type flat) {
          const ordinal_type p = flat / numEntries;
          const ordinal_type q = flat % numEntries;

          row_view_type &rowC = C.RowView( rowA.Col(p) );

          const ordinal_type pos = rowC.Index( rowA.Col(q) );

          if (pos >= 0)
            rowC.Value(pos) += alpha* rowA.Value(p) * rowA.Value(q);
        });
#endif

      member.team_barrier();
    }

    return 0;
  }
  // Team-level sparse update for the <ConjTranspose, NoTranspose,
  // ForFactorBlocked> specialization: C is scaled by beta, then for every row
  // k of A and B and every pair of stored entries (i of a, j of b),
  // alpha * a.Value(i) * b.Value(j) is added to the entry of C at row
  // a.Col(i), column b.Col(j). Pairs for which c.Index() reports a negative
  // position are skipped.
  // NOTE(review): the active branch applies no conjugate to the entries of A
  // even though the specialization is tagged ConjTranspose -- confirm this is
  // intended for complex value types.
  //
  // policy : not referenced by this kernel.
  // member : team handle used for the nested parallel loop and the barrier.
  // alpha  : scaling factor applied to each product.
  // A, B   : sparse inputs accessed through their row views.
  // beta   : scaling factor applied to C before accumulation.
  // C      : sparse output, updated in place.
  // Returns 0 always.
  KOKKOS_INLINE_FUNCTION
  int
  Gemm<Trans::ConjTranspose,Trans::NoTranspose,
       AlgoGemm::ForFactorBlocked>
  ::invoke(typename CrsExecViewTypeA::policy_type &policy,
           const typename CrsExecViewTypeA::policy_type::member_type &member,
           const ScalarType alpha,
           typename CrsExecViewTypeA::matrix_type &A,
           typename CrsExecViewTypeB::matrix_type &B,
           const ScalarType beta,
           typename CrsExecViewTypeC::matrix_type &C) {
    typedef typename CrsExecViewTypeA::ordinal_type      ordinal_type;
    typedef typename CrsExecViewTypeA::value_type        value_type;
    typedef typename CrsExecViewTypeA::row_view_type     row_view_type;

    // Debug trace of the extents of C; permanently disabled by the
    // constant-false condition.
    if ( false && member.team_rank() == 0 ) {
      printf("Gemm [%d +%d)x[%d +%d)\n"
             , C.OffsetRows()
             , C.NumRows()
             , C.OffsetCols()
             , C.NumCols()
             );
    }

    // scale the matrix C with beta
    scaleCrsMatrix<ScalarType,CrsExecViewTypeC>(member, beta, C);

    // Sparse matrix-matrix multiply:
    // C(i,j) += alpha*A'(i,k)*B(k,j)

    const ordinal_type mA = A.NumRows();
    for (ordinal_type k=0;k<mA;++k) {
      row_view_type &a = A.RowView(k);
      const ordinal_type nnz_a = a.NumNonZeros();

      row_view_type &b = B.RowView(k);
      const ordinal_type nnz_b = b.NumNonZeros();

      if (nnz_a > 0 && nnz_b > 0 ) {
#if 0
        // Disabled alternative: one loop index per stored entry of a, scanning
        // all stored entries of b.
        Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, 0, nnz_a),
          [&](const ordinal_type i) {
             const ordinal_type row_at_i  = a.Col(i);
             const value_type   val_at_ik = a.Value(i);
             // const value_type   val_at_ik = conj(a.Value(i));

             row_view_type &c = C.RowView(row_at_i);

             ordinal_type idx = 0;
             for (ordinal_type j=0;j<nnz_b && (idx > -2);++j) {
                const ordinal_type col_at_j  = b.Col(j);
                const value_type   val_at_kj = b.Value(j);

                idx = c.Index(col_at_j, idx);
                if (idx >= 0)
                  c.Value(idx) += alpha*val_at_ik*val_at_kj;
                }
          });
#else
        // Active variant: one loop index per pair (i, j), flattened into a
        // single range of nnz_a * nnz_b indices.
        Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, 0, nnz_a * nnz_b ),
          [&](const ordinal_type ii) {
             // Row-major decomposition with nnz_b as the inner extent, so
             // that i stays in [0, nnz_a) and j in [0, nnz_b). Splitting by
             // nnz_a instead would swap the two ranges and index a and b out
             // of bounds whenever nnz_a != nnz_b.
             const ordinal_type i = ii / nnz_b ;
             const ordinal_type j = ii % nnz_b ;

             row_view_type &c = C.RowView( a.Col(i) );

             // Look up c's position of column b.Col(j); negative means the
             // entry is not available in c and the product is dropped.
             const ordinal_type idx = c.Index( b.Col(j) );

             if (idx >= 0) {
               // const value_type   val_at_ik = conj(a.Value(i));
               c.Value(idx) += alpha * a.Value(i) * b.Value(j);
             }
          });
#endif

        member.team_barrier();
      }
    }

    return 0;
  }