// One-dimensional element-wise assignment: lhs(i) = rhs(i).
//
// expr::evaluate(rhs) is invoked first, before any element is read.
// Afterwards every index in [0, lhs.size(1, 0)) is copied from rhs to
// lhs through the get()/put() accessors, in increasing index order.
static void exec(LHS &lhs, RHS const &rhs)
{
  expr::evaluate(rhs);

  // The element count is taken from the destination, not from rhs.
  length_type const length = lhs.size(1, 0);

  index_type idx = 0;
  while (idx < length)
  {
    lhs.put(idx, rhs.get(idx));
    ++idx;
  }
}
// One-dimensional assignment lhs = rhs using the SIMD proxy interface.
//
// The destination is accessed through an Ext_data object opened with
// SYNC_OUT, and the expression is wrapped in a proxy produced by
// simd::Proxy_factory.  Elements are transferred in groups of
// `vec_size` via load()/store(); whatever does not fill a whole group
// is copied element by element with get()/put().
//
// NOTE(review): this presumably requires that Ext_data gives direct
// (non-copying) access to lhs, since the remainder loop writes through
// lhs.put() while `dda` is still alive -- confirm against the
// evaluator's validity checks, which are not visible here.
static void exec(LHS &lhs, RHS const &rhs)
{
  using namespace impl;

  // Access traits for the writable destination and the read-only expression.
  typedef typename simd::LValue_access_traits<typename LHS::value_type> WAT;
  typedef typename simd::Proxy_factory<RHS, false>::access_traits EAT;

  // Group width for the destination's value type, as reported by Simd_traits.
  length_type const vec_size =
    simd::Simd_traits<typename LHS::value_type>::vec_size;

  // Data access to the destination, opened with SYNC_OUT.
  Ext_data<LHS, layout_type> dda(lhs, SYNC_OUT);

  simd::Proxy<WAT,true>  lp(dda.data());
  simd::Proxy<EAT,false> rp(simd::Proxy_factory<RHS,false>::create(rhs));

  length_type const size = dda.size(0);
  length_type n = size;   // elements still to be processed

  // Loop using the proxy interface.  Original author's note: this
  // generates the best code with gcc 3.4; with gcc 4.1 the difference
  // to an alternative formulation (not visible here) is negligible.
  while (n >= vec_size)
  {
    lp.store(rp.load());
    n -= vec_size;
    lp.increment();
    rp.increment();
  }

  // Process the remainder, i.e. indices [size - n, size), using simple
  // loop fusion.
  for (index_type i = size - n; i != size; ++i)
    lhs.put(i, rhs.get(i));
}
// Two-dimensional element-wise assignment: lhs(i, j) = rhs(i, j).
//
// expr::evaluate(rhs) is invoked first.  The extents come from the
// destination: lhs.size(2, 0) for the first index and lhs.size(2, 1)
// for the second.  The second index is walked by the outer loop and
// the first index by the inner loop, so the first index varies fastest.
static void exec(LHS &lhs, RHS const &rhs)
{
  expr::evaluate(rhs);

  length_type const num_rows = lhs.size(2, 0);
  length_type const num_cols = lhs.size(2, 1);

  index_type col = 0;
  while (col < num_cols)
  {
    index_type row = 0;
    while (row < num_rows)
    {
      lhs.put(row, col, rhs.get(row, col));
      ++row;
    }
    ++col;
  }
}
// Three-dimensional element-wise assignment: lhs(i, j, k) = rhs(i, j, k).
//
// expr::evaluate(rhs) is invoked first.  The three extents are read
// from the destination via lhs.size(3, d) for d = 0, 1, 2.  Loop
// nesting, from outermost to innermost, is: dimension 0, dimension 2,
// dimension 1 -- i.e. the middle index varies fastest.
static void exec(LHS &lhs, RHS const &rhs)
{
  expr::evaluate(rhs);

  length_type const extent0 = lhs.size(3, 0);
  length_type const extent1 = lhs.size(3, 1);
  length_type const extent2 = lhs.size(3, 2);

  index_type a = 0;                 // index along dimension 0
  while (a < extent0)
  {
    index_type c = 0;               // index along dimension 2
    while (c < extent2)
    {
      index_type b = 0;             // index along dimension 1
      while (b < extent1)
      {
        lhs.put(a, b, c, rhs.get(a, b, c));
        ++b;
      }
      ++c;
    }
    ++a;
  }
}