  ::invoke(typename CrsExecViewTypeA::policy_type &policy,
           const typename CrsExecViewTypeA::policy_type::member_type &member,
           const int diagA,
           const ScalarType alpha,
           CrsExecViewTypeA &A,
           DenseExecViewTypeB &B) {
    // Team-level sparse triangular solve that overwrites B in place, sweeping
    // the rows of A forward (k = 0 .. mA-1) and using conjugated entries of A.
    //
    // Parameters (as used in the visible body):
    //   policy - not referenced in this excerpt.
    //   member - team handle passed to scaleDenseMatrix and Kokkos::TeamThreadRange.
    //   diagA  - compared against Diag::Unit; the division by the diagonal is
    //            skipped when they are equal.
    //   alpha  - scalar forwarded to scaleDenseMatrix together with B.
    //   A      - sparse matrix read row by row through NumRows()/RowView().
    //   B      - dense matrix read and updated in place through Value(i, j).
    // Returns 0.
    //
    // NOTE(review): this excerpt has lost the closing "});" / "}" tokens of its
    // lambdas and scopes; the nesting described below follows the indentation.
    typedef typename CrsExecViewTypeA::ordinal_type      ordinal_type;
    typedef typename CrsExecViewTypeA::value_type        value_type;
    typedef typename CrsExecViewTypeA::row_view_type     row_view_type;

    // scale the matrix B with alpha (presumably B := alpha*B; scaleDenseMatrix
    // is defined elsewhere)
    scaleDenseMatrix(member, alpha, B);

    // Solve a system: AX = B -> B := inv(A) B
    // (the entries of A are conjugated and row k of A updates *other rows* of B,
    // so this looks like the conjugate-transposed variant - TODO confirm)
    const ordinal_type mA = A.NumRows();
    const ordinal_type nB = B.NumCols();

    // nothing to do when B has no columns
    if (nB > 0) {
      for (ordinal_type k=0;k<mA;++k) {
        row_view_type &a = A.RowView(k);
        // entry 0 of the row is treated as the diagonal
        // NOTE(review): a.Value(0) is read before nnz_a is checked below -
        // confirm that every row stores at least one entry.
        const value_type cdiag = conj(a.Value(0));

        // invert
        if (diagA != Diag::Unit) {
          // b1t = b1t / conj(diag);
          // row k of B is divided by the conjugated diagonal, one column j per
          // loop iteration
          Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, nB),
                               [&](const ordinal_type j) {
                                 B.Value(k, j) /= cdiag;

        // update
        const ordinal_type nnz_a = a.NumNonZeros();
        if (nnz_a > 0) {
          // B2 = B2 - trans(conj(a12t)) b1t
          Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, nB),
                               [&](const ordinal_type j) {
                                 // grab b1t
                                 const value_type val_at_j = B.Value(k, j);
                                 // start at 1: entry 0 was consumed as the diagonal
                                 for (ordinal_type i=1;i<nnz_a;++i) {
                                   // grab a12t
                                   // the column index of the entry selects the row of B to update
                                   const ordinal_type row_at_i = a.Col(i);
                                   const value_type   val_at_i = conj(a.Value(i));
                                   // update B2
                                   B.Value(row_at_i, j) -= val_at_i*val_at_j;

    return 0;
Пример #2
  ::invoke(typename CrsExecViewTypeA::policy_type &policy,
           const typename CrsExecViewTypeA::policy_type::member_type &member,
           const int diagA,
           const ScalarType alpha,
           CrsExecViewTypeA &A,
           DenseExecViewTypeB &B) {
    // Team-level sparse triangular solve that overwrites B in place, sweeping
    // the rows of A backward (k = mA-1 .. 0). Row k of B is first reduced by the
    // off-diagonal entries of row k of A times the rows of B they point at, and
    // is then divided by the diagonal unless diagA == Diag::Unit.
    //
    // Parameters (as used in the visible body):
    //   policy - not referenced in this excerpt.
    //   member - team handle passed to scaleDenseMatrix and Kokkos::TeamThreadRange.
    //   diagA  - compared against Diag::Unit to decide whether to divide by the diagonal.
    //   alpha  - scalar forwarded to scaleDenseMatrix together with B.
    //   A      - sparse matrix read row by row through NumRows()/RowView().
    //   B      - dense matrix read and updated in place through Value(i, j).
    // Returns 0.
    //
    // NOTE(review): this excerpt has lost the closing "});" / "}" tokens of its
    // lambdas and scopes; the nesting described here follows the indentation.
    typedef typename CrsExecViewTypeA::ordinal_type      ordinal_type;
    typedef typename CrsExecViewTypeA::value_type        value_type;
    typedef typename CrsExecViewTypeA::row_view_type     row_view_type;

    // scale the matrix B with alpha (presumably B := alpha*B; scaleDenseMatrix
    // is defined elsewhere)
    scaleDenseMatrix(member, alpha, B);

    // Solve a system: AX = B -> B := inv(A) B
    const ordinal_type mA = A.NumRows();
    const ordinal_type nB = B.NumCols();
    // nothing to do when B has no columns
    if (nB > 0) {
      // NOTE(review): "k>=0" only terminates when ordinal_type is signed -
      // confirm against the definition of CrsExecViewTypeA::ordinal_type.
      for (ordinal_type k=mA-1;k>=0;--k) {
        row_view_type &a = A.RowView(k);
        // entry 0 of the row is treated as the diagonal
        const value_type diag = a.Value(0);
        // update
        const ordinal_type nnz_a = a.NumNonZeros();
        if (nnz_a > 0) {
          // b1t = b1t - a12t B2
          Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, nB),
                               [&](const ordinal_type j) {
                                 // start at 1: entry 0 is the diagonal
                                 for (ordinal_type i=1;i<nnz_a;++i) {
                                   const ordinal_type row_at_i = a.Col(i);   // grab B2 row
                                   const value_type   val_at_i = a.Value(i); // grab a12t value
                                   // update b1t
                                   B.Value(k, j) -= val_at_i*B.Value(row_at_i, j);
          // invert
          // NOTE(review): by indentation this division sits inside the
          // "nnz_a > 0" branch, while diag was already read above regardless
          // of nnz_a - confirm the intended nesting.
          if (diagA != Diag::Unit) {
            // b1t = b1t / diag
            Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, nB),
                                 [&](const ordinal_type j) {
                                   B.Value(k, j) /= diag;


    return 0;
Пример #3
  ::invoke(typename CrsExecViewTypeA::policy_type &policy,
           const typename CrsExecViewTypeA::policy_type::member_type &member,
           const ScalarType alpha,
           CrsExecViewTypeA &A,
           DenseExecViewTypeB &B,
           const ScalarType beta,
           DenseExecViewTypeC &C) {
    // Team-level update of the dense matrix C from a sparse matrix A and a dense
    // matrix B: C is first passed to scaleDenseMatrix with beta, then for every
    // stored entry i of every row k of A, and every column j of B,
    //   C(a.Col(i), j) += alpha * conj(a.Value(i)) * B(k, j).
    //
    // Parameters (as used in the visible body):
    //   policy - not referenced in this excerpt.
    //   member - team handle passed to scaleDenseMatrix and createThreadLoopRegion.
    //   alpha  - scalar multiplied into every accumulated product.
    //   A      - sparse matrix read row by row through NumRows()/RowView().
    //   B      - dense matrix read through Value(k, j).
    //   beta   - scalar forwarded to scaleDenseMatrix together with C.
    //   C      - dense matrix updated in place.
    // Returns 0.
    //
    // NOTE(review): this excerpt has lost the closing "});" / "}" tokens of its
    // lambda and scopes; the nesting described here follows the indentation.
    typedef typename CrsExecViewTypeA::ordinal_type      ordinal_type;
    typedef typename CrsExecViewTypeA::value_type        value_type;
    typedef typename CrsExecViewTypeA::row_view_type     row_view_type;
    typedef typename CrsExecViewTypeA::team_factory_type team_factory_type;

    // scale the matrix C with beta (presumably C := beta*C; scaleDenseMatrix is
    // defined elsewhere)
    scaleDenseMatrix<ParallelForType>(member, beta, C);

    // C(i,j) += alpha*A'(i,k)*B(k,j)
    const ordinal_type mA = A.NumRows();
    for (ordinal_type k=0;k<mA;++k) {
      row_view_type &a = A.RowView(k);
      const ordinal_type nnz_a = a.NumNonZeros();
      const ordinal_type nB = B.NumCols();

      // skip empty rows of A and the case where B has no columns
      if (nnz_a > 0 && nB > 0) {
        // the loop over the stored entries of row k is handed to ParallelForType;
        // presumably the column indices within a row are distinct, so different
        // i update different rows of C - TODO confirm
        ParallelForType(team_factory_type::createThreadLoopRegion(member, 0, nnz_a),
                        [&](const ordinal_type i) {
                          // the column index of the entry selects the row of C to update
                          const ordinal_type row_at_i = a.Col(i);
                          const value_type   val_at_ik = conj(a.Value(i));

                          for (ordinal_type j=0;j<nB;++j) {
                            const value_type val_at_kj = B.Value(k, j);
                            C.Value(row_at_i, j) += alpha*val_at_ik*val_at_kj;

    return 0;
  ::invoke(PolicyType &policy,
           MemberType &member,
           const ScalarType alpha,
           DenseExecViewTypeA &A,
           DenseExecViewTypeB &B,
           const ScalarType beta,
           DenseExecViewTypeC &C) {
    // Dense matrix-matrix update of C from A and B ("simple implementation").
    // With m = C.NumRows(), n = C.NumCols(), k = B.NumRows(), the visible code
    //   1. returns early when C is empty or when there is nothing to add and
    //      beta == 1,
    //   2. zeroes C (beta == 0) or scales it by beta (beta != 1),
    //   3. when alpha != 0, accumulates C(i, j) += A(l, i) * B(l, j) for l in
    //      [0, k), i.e. A is indexed transposed.
    //
    // Parameters (as used in the visible body):
    //   policy - not referenced in this excerpt.
    //   member - team handle; team_rank() gates the body and it is passed to
    //            Kokkos::TeamThreadRange.
    //   alpha  - only compared against 0 (see review note below).
    //   A, B   - dense inputs read through Value(i, j).
    //   beta   - scale factor applied to C before accumulation.
    //   C      - dense matrix updated in place.
    // Returns 0.
    //
    // NOTE(review): alpha is tested against 0 but never multiplied into the
    // accumulated products below - confirm whether alpha*A(l, i)*tmp was intended.
    // NOTE(review): by indentation every TeamThreadRange loop sits inside the
    // "team_rank() == 0" guard; TeamThreadRange normally splits its iterations
    // over all threads of the team, so confirm this is correct for team size > 1.
    // NOTE(review): this excerpt has lost the closing "});" / "}" tokens of its
    // lambdas and scopes; the nesting described here follows the indentation.

    // static_assert( Kokkos::Impl::is_same<
    //                typename DenseMatrixTypeA::space_type,
    //                Kokkos::Cuda
    //                >::value,
    //                "Cuda space is not available for calling external BLAS" );

    // static_assert( Kokkos::Impl::is_same<
    //                typename DenseMatrixTypeA::space_type,
    //                typename DenseMatrixTypeB::space_type
    //                >::value &&
    //                Kokkos::Impl::is_same<
    //                typename DenseMatrixTypeB::space_type,
    //                typename DenseMatrixTypeC::space_type
    //                >::value,
    //                "Space type of input matrices does not match" );
    //typedef typename DenseExecViewTypeA::space_type   space_type;
    typedef typename DenseExecViewTypeA::ordinal_type ordinal_type;
    typedef typename DenseExecViewTypeA::value_type   value_type;

    if (member.team_rank() == 0) {
      const ordinal_type m = C.NumRows();
      const ordinal_type n = C.NumCols();
      const ordinal_type k = B.NumRows();

      // for now simple implementation
      // quick return: empty C, or nothing to add while C keeps its values (beta == 1)
      if (m == 0 || n == 0 || ((alpha == 0 || k == 0) && (beta == 1))) return 0;
      if (alpha == 0) {
        // no product term: C is only zeroed or scaled
        if (beta == 0) {
          // assign 0 instead of multiplying, one column j per loop iteration
          Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, n),
                               [&](const ordinal_type j) {
                                 for (ordinal_type i=0;i<m;++i)
                                   C.Value(i, j) = 0.0;
        } else {
          Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, n),
                               [&](const ordinal_type j) {
                                 for (ordinal_type i=0;i<m;++i)
                                   C.Value(i, j) = beta*C.Value(i, j);
      } else {

        // scale beta
        // beta == 1 leaves C untouched
        if (beta == 0.0) 
          Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, n),
                               [&](const ordinal_type j) {
                                 for (ordinal_type i=0;i<m;++i)
                                   C.Value(i, j) = 0.0;
        else if (beta != 1.0) 
          Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, n),
                               [&](const ordinal_type j) {
                                 for (ordinal_type i=0;i<m;++i)
                                   C.Value(i, j) = beta*C.Value(i, j);
        // gemm
        // one rank-1 style update per l: column j of C gains A(l, :) * B(l, j)
        for (ordinal_type l=0;l<k;++l) {      
          Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, n),
                               [&](const ordinal_type j) {
                                 // B(l, j) is invariant in the inner loop over i
                                 const value_type tmp = B.Value(l, j);
                                 //#pragma unroll
                                 for (ordinal_type i=0;i<m;++i)
                                   C.Value(i, j) += A.Value(l, i)*tmp;

    return 0;
  ::invoke(PolicyType &policy,
           MemberType &member,
           const ScalarType alpha,
           DenseExecViewTypeA &A,
           DenseExecViewTypeB &B,
           const ScalarType beta,
           DenseExecViewTypeC &C) {
    // Dense matrix-matrix update of C from A and B, executed with plain loops by
    // the thread whose team_rank() is 0. With m = C.NumRows(), n = C.NumCols(),
    // k = B.NumRows(), the visible code
    //   1. returns early (on every thread) when C is empty or when there is
    //      nothing to add and beta == 1,
    //   2. zeroes C (beta == 0) or scales it by beta (beta != 1),
    //   3. when alpha != 0, accumulates C(i, j) += A(i, l) * B(l, j) using a
    //      blocked loop nest over full mc x nr x kc blocks followed by a
    //      remainder loop nest.
    //
    // Parameters (as used in the visible body):
    //   policy - not referenced in this excerpt.
    //   member - team handle; only team_rank() is queried.
    //   alpha  - only compared against 0 (see review note below).
    //   A, B   - dense inputs read through Value(i, j).
    //   beta   - scale factor applied to C before accumulation.
    //   C      - dense matrix updated in place.
    // Returns 0.
    //
    // NOTE(review): alpha is tested against 0 but never multiplied into the
    // accumulated products below - confirm whether that is intended.
    // NOTE(review): this excerpt has lost the closing "}" tokens of its scopes;
    // the nesting described here follows the indentation.
    typedef typename DenseExecViewTypeA::ordinal_type ordinal_type;
    typedef typename DenseExecViewTypeA::value_type   value_type;

    const ordinal_type m = C.NumRows();
    const ordinal_type n = C.NumCols();
    const ordinal_type k = B.NumRows();
    // for now simple implementation
    // quick return: empty C, or nothing to add while C keeps its values (beta == 1)
    if (m == 0 || n == 0 || ((alpha == 0 || k == 0) && (beta == 1))) return 0;

    // C = beta C + alpha AB
    if (member.team_rank() == 0) {
      if (alpha == 0) {
        // no product term: C is only zeroed or scaled
        if (beta == 0) {
          for (ordinal_type j=0;j<n;++j)
            for (ordinal_type i=0;i<m;++i)
              C.Value(i, j) = 0.0;
        } else {
          for (ordinal_type j=0;j<n;++j)
            for (ordinal_type i=0;i<m;++i)
              C.Value(i, j) = beta*C.Value(i, j);
      } else {
        // scale beta
        // beta == 1 leaves C untouched
        if      (beta == 0.0) 
          for (ordinal_type j=0;j<n;++j)
            for (ordinal_type i=0;i<m;++i)
              C.Value(i, j) = 0.0;
        else if (beta != 1.0) 
          for (ordinal_type j=0;j<n;++j)
            for (ordinal_type i=0;i<m;++i)
              C.Value(i, j) = beta*C.Value(i, j);
        // gemm blocked
        // block extents: mc rows of C, nr columns of C, kc along the inner
        // dimension; nnr is the column width of one panel inside a block
          constexpr ordinal_type mc = 128, nr = 128, kc = 32, nnr = 16;
            // block update
            // number of full blocks along each dimension (integer division)
            const ordinal_type mm = m/mc, nn = n/nr, kk = k/kc;
            for (ordinal_type l=0;l<kk;++l)      
              for (ordinal_type i=0;i<mm;++i) 
                for (ordinal_type j=0;j<nn;++j) {
                  // element offsets of the current block
                  const ordinal_type loff = l*kc, moff = i*mc, noff = j*nr;
                  // GEBP : C_ij += A_il B_lj;
                    // the nr columns of the block are processed as np panels of nnr columns
                    constexpr ordinal_type np = (nr/nnr);
                    for (ordinal_type p=0;p<np;++p) {
                      const ordinal_type poff = p*nnr;
                      for (ordinal_type ll=0;ll<kc;++ll)      
                        for (ordinal_type ii=0;ii<mc;++ii) 
                          for (ordinal_type jj=0;jj<nnr;++jj) 
                            C.Value(ii+moff, jj+noff+poff) 
                              += A.Value(ii+moff, ll+loff)*B.Value(ll+loff, jj+noff+poff);
            // remainder
            // first index past the last full block in each dimension
            // NOTE(review): the loops below only visit index triples where l, i
            // and j are ALL in their tail range; combinations such as a tail row
            // i >= ibegin with l < lbegin or j < jbegin are covered by neither
            // loop nest, so those products appear to be missing whenever a
            // dimension is not a multiple of its block size - confirm.
            const ordinal_type lbegin = (k - k%kc), ibegin = (m - m%mc), jbegin = (n - n%nr);
            for (ordinal_type l=lbegin;l<k;++l)       
              for (ordinal_type i=ibegin;i<m;++i)
                for (ordinal_type j=jbegin;j<n;++j) 
                  C.Value(i, j) += A.Value(i, l)*B.Value(l, j);
    return 0;