static void apply( const tensor_type & tensor , const MatrixValue * const a , const VectorValue * const x , VectorValue * const y ) { const size_type nDim = tensor.dimension(); // Loop over i for ( size_type i = 0; i < nDim; ++i) { VectorValue ytmp = 0; // Loop over k for this i const size_type nk = tensor.num_k(i); const size_type kBeg = tensor.k_begin(i); const size_type kEnd = kBeg + nk; for (size_type kEntry = kBeg; kEntry < kEnd; ++kEntry) { const size_type k = tensor.k_coord(kEntry); const MatrixValue ak = a[k]; const VectorValue xk = x[k]; // Loop over j for this i,k const size_type nj = tensor.num_j(kEntry); const size_type jBeg = tensor.j_begin(kEntry); const size_type jEnd = jBeg + nj; for (size_type jEntry = jBeg; jEntry < jEnd; ++jEntry) { const size_type j = tensor.j_coord(jEntry); ytmp += tensor.value(jEntry) * ( a[j] * xk + ak * x[j] ); } } y[i] += ytmp ; } }
// Accumulates the sparse 3-tensor product into y, traversing the tensor in
// k -> j -> i order.
//
// For each k the tensor exposes a range of j entries, and each j entry
// exposes a range of i entries that carry the stored tensor values.  The
// term  a[j] * x[k] + a[k] * x[j]  is formed once per (k, j) pair and then
// scaled by each stored value and added to the matching y[i].
// NOTE(review): the two-term form suggests only one of the (j,k)/(k,j)
// pairs is stored in the tensor -- confirm against the tensor's construction.
//
//   tensor : sparse tensor providing the k -> j -> i index structure
//   a      : matrix values, indexed by tensor coordinate
//   x      : input vector values, indexed by tensor coordinate
//   y      : output vector values; contributions are added to existing content
KOKKOS_INLINE_FUNCTION
static void apply( const tensor_type & tensor ,
                   const MatrixValue * const a ,
                   const VectorValue * const x ,
                         VectorValue * const y )
{
  const size_type kCount = tensor.num_k();

  for ( size_type kCoord = 0; kCoord < kCount; ++kCoord ) {

    // Values at the k coordinate are reused for every j entry below.
    const MatrixValue aAtK = a[kCoord];
    const VectorValue xAtK = x[kCoord];

    // Range of j entries belonging to this k.
    const size_type jCount = tensor.num_j(kCoord);
    const size_type jFirst = tensor.j_begin(kCoord);
    const size_type jLast  = jFirst + jCount;

    for ( size_type jIdx = jFirst; jIdx < jLast; ++jIdx ) {

      const size_type jCoord = tensor.j_coord(jIdx);

      // Shared factor for every i entry under this (k, j) pair.
      VectorValue pairTerm = a[jCoord] * xAtK + aAtK * x[jCoord];

      // Range of i entries belonging to this (k, j) pair.
      const size_type iCount = tensor.num_i(jIdx);
      const size_type iFirst = tensor.i_begin(jIdx);
      const size_type iLast  = iFirst + iCount;

      for ( size_type iIdx = iFirst; iIdx < iLast; ++iIdx ) {
        const size_type row = tensor.i_coord(iIdx);
        y[row] += tensor.value(iIdx) * pairTerm;
      }
    }
  }
}
static void apply( const tensor_type & tensor , const MatrixValue * const a , const VectorValue * const x , VectorValue * const y ) { // const int max_size = 10; // MatrixValue ax[max_size][max_size]; const size_type nBlock = tensor.num_coord(); // Loop over coordinate blocks size_type value_entry = 0; for ( size_type block = 0; block < nBlock; ++block) { const size_type i_begin = tensor.get_i_begin(block); const size_type j_begin = tensor.get_j_begin(block); const size_type k_begin = tensor.get_k_begin(block); const size_type i_size = tensor.get_i_size(block); const size_type j_size = tensor.get_j_size(block); const size_type k_size = tensor.get_k_size(block); VectorValue * const y_block = y + i_begin; const MatrixValue * const a_block = a + j_begin; const VectorValue * const x_block = x + k_begin; // // Precompute a*x outer product // for (size_type j=0; j<j_size; ++j) { // for (size_type k=0; k<k_size; ++k) { // ax[j][k] = a_block[j]*x_block[k]; // } // } /* // Compute y_i = \sum_{j,k} c_{ijk} * a_j * x_k for (size_type i=0; i<i_size; ++i) { VectorValue ytmp = 0; for (size_type j=0; j<j_size; ++j) { const size_type imj = i-j; const size_type ipj = i+j+1; const size_type k_beg = 0 <= imj ? imj : -imj; const size_type k_end = k_size <= ipj ? k_size : ipj; const size_type k0 = k_beg % 2 == (i+j) % 2 ? k_beg : k_beg+1; for (size_type k=k0; k<k_end; ++k) { //ytmp += tensor.value(value_entry++) * ax[j][k]; ytmp += tensor.value(value_entry++) * ( a_block[j] * x_block[k] ); } } y_block[i] += ytmp ; } */ // Compute y_i = \sum_{j,k} c_{ijk} * a_j * x_k for (size_type i=0; i<i_size; ++i) { VectorValue ytmp = 0; for (size_type j=0; j<j_size; ++j) { for (size_type k=((i+j)%2); k<k_size; k+=2) { ytmp += tensor.value(value_entry++) * ( a_block[j] * x_block[k] ); } } y_block[i] += ytmp ; } } }