// Processes `size` consecutive positions, the first one being o_ + begin.
// For every position a fresh accumulator is seeded with the neutral element
// and handed to details::scan_step together with the output, the input, the
// binary operation, the position, the second and the first extent of the
// input (in that order) and a trailing `false` flag.
void operator()(std::size_t begin, std::size_t size)
{
  extent_type dims = in_.extent();
  std::size_t inner_extent  = boost::fusion::at_c<0>(dims);
  std::size_t middle_extent = boost::fusion::at_c<1>(dims);

  std::size_t position = o_ + begin;

  for(std::size_t done = 0; done < size; ++done)
  {
    value_type accumulator = neutral_(meta::as_<value_type>());

    // The summary returned by the step is stored but never read afterwards.
    accumulator = details::scan_step( accumulator
                                    , out_, in_
                                    , bop_
                                    , position, middle_extent, inner_extent
                                    , false
                                    );

    ++position;
  }
}
// Runs the scan over `size` outer slices, the first one being slice `begin`.
//
// A slice spans (first extent * second extent) linear positions. The grain is
// nt2::config::top_cache_line_size(2) divided by sizeof(value_type).
//
//  - First extent >= grain: one outer_scan_step_incache_ worker per slice,
//    run over [0, first extent), through the transform_ spawner when the call
//    covers as many slices as the input has, directly otherwise.
//  - First extent < grain: one outer_scan_step_outcache_ worker per position
//    of the slice, run along the second extent, through the scan_ spawner
//    when the call covers every slice and the grain-aligned part is larger
//    than one grain, directly otherwise.
//
// NOTE(review): `size == third extent` presumably detects a top-level
// (non-nested) invocation before spawning more parallel work - confirm.
void operator()(std::size_t begin, std::size_t size) const
{
  extent_type dims = in_.extent();
  std::size_t inner  = boost::fusion::at_c<0>(dims);
  std::size_t middle = boost::fusion::at_c<1>(dims);
  std::size_t outer  = boost::fusion::at_c<2>(dims);
  std::size_t slice  = inner * middle;

  std::size_t grain = nt2::config::top_cache_line_size(2)/sizeof(value_type);

  // Largest multiple of grain that does not exceed the second extent
  std::size_t middle_floor = (middle/grain)*grain;

  bool const covers_all_slices = (size == outer);

  // Linear offset of the slice currently being processed
  std::size_t slice_start = begin * slice;

  if( !(inner < grain) )
  {
    // Spawner/worker pair working along the first extent
    nt2::spawner<tag::transform_, BackEnd> spawn;

    for(std::size_t done = 0; done < size; ++done, slice_start += slice)
    {
      nt2::worker<tag::outer_scan_step_incache_,BackEnd,Site,Out,In,Neutral,Bop>
      step(out_,in_,neutral_,bop_, slice_start);

      if( covers_all_slices ) spawn(step,0,inner,grain);
      else                    step(0,inner);
    }

    return;
  }

  // Spawner/worker pair working along the second extent
  nt2::spawner<tag::scan_, BackEnd, value_type> spawn;

  bool const split_middle = covers_all_slices && (grain < middle_floor);

  for(std::size_t done = 0; done < size; ++done, slice_start += slice)
  {
    std::size_t position = slice_start;

    for(std::size_t column = 0; column < inner; ++column, ++position)
    {
      nt2::worker<tag::outer_scan_step_outcache_,BackEnd,Site,Out,In,Neutral,Bop>
      step(out_,in_,neutral_,bop_,position);

      value_type summary = neutral_(nt2::meta::as_<value_type>());

      // Grain-aligned head [0, middle_floor)
      if( split_middle )
        summary = spawn( step, 0, middle_floor, grain);
      else if( middle_floor != 0 )
        summary = step(summary, 0, middle_floor, false);

      // Remaining tail [middle_floor, second extent), always run directly
      summary = step(summary, middle_floor, middle-middle_floor, false);
    }
  }
}
// Runs the fold over `size` outer slices, the first one being slice `begin`.
//
// An input slice spans (first extent * second extent) linear positions and
// an output slice spans (first extent) positions. The grain is
// nt2::config::top_cache_line_size(2) divided by sizeof(value_type). The
// first extent is split into an aligned head [0, inner_floor), walked in
// steps of cardinal_of<target_type>, and a tail [inner_floor, first extent),
// walked one element at a time.
//
//  - inner_floor >= grain: two outer_fold_step_incache_ workers per call, one
//    built for Site (head) and one for tag::cpu_ (tail). The head goes
//    through the transform_ spawner when the call covers as many slices as
//    the input has and the head is larger than one grain.
//  - inner_floor < grain: a single outer_fold_step_outcache_ worker folds
//    along the second extent for each output position, as target_type over
//    the head and as value_type over the tail, and each result is written
//    with nt2::run.
//
// NOTE(review): `size == third extent` presumably detects a top-level
// (non-nested) invocation before spawning more parallel work - confirm.
void operator()(std::size_t begin, std::size_t size) const
{
  extent_type dims = in_.extent();
  static const std::size_t cardinal = boost::simd::meta::cardinal_of<target_type>::value;

  std::size_t inner     = boost::fusion::at_c<0>(dims);
  std::size_t middle    = boost::fusion::at_c<1>(dims);
  std::size_t outer     = boost::fusion::at_c<2>(dims);
  std::size_t in_slice  = inner * middle;

  std::size_t line_elements = nt2::config::top_cache_line_size(2) / sizeof(value_type);
  std::size_t grain         = line_elements;

  // Alignment boundary used to split the first extent.
  // NOTE(review): line_elements has already been divided by
  // sizeof(value_type), so the element size is divided out twice here and the
  // boundary evaluates to 0 whenever line_elements < sizeof(value_type) *
  // cardinal - confirm this is intended.
  std::size_t boundary    = (line_elements / (sizeof(value_type)*cardinal))*cardinal;
  std::size_t inner_floor = boost::simd::align_under(inner, boundary);

  // Largest multiple of grain that does not exceed the second extent
  std::size_t middle_floor = (middle/grain)*grain;

  bool const covers_all_slices = (size == outer);

  // Linear offsets of the current slice in the output and in the input
  std::size_t out_base = begin*inner;
  std::size_t in_base  = begin * in_slice;

  if( !(inner_floor < grain) )
  {
    // Spawner/workers working along the first extent
    nt2::spawner<tag::transform_, BackEnd> spawn;

    // Worker built for Site, used on the aligned head
    nt2::worker<tag::outer_fold_step_incache_,BackEnd,Site,Out,In,Neutral,Bop>
    head_step(out_,in_,neutral_,bop_);

    // Worker built for tag::cpu_, used on the tail
    nt2::worker<tag::outer_fold_step_incache_,BackEnd,tag::cpu_,Out,In,Neutral,Bop>
    tail_step(out_,in_,neutral_,bop_);

    bool const split_inner = covers_all_slices && (grain < inner_floor);

    for(std::size_t done = 0; done < size; ++done, out_base += inner, in_base += in_slice)
    {
      head_step.update(out_base, in_base);
      tail_step.update(out_base, in_base);

      // Aligned head [0, inner_floor)
      if( split_inner )
        spawn(head_step,0,inner_floor,grain);
      else if( inner_floor != 0 )
        head_step(0,inner_floor);

      // Tail [inner_floor, first extent)
      tail_step(inner_floor,inner-inner_floor);
    }

    return;
  }

  // Spawners/worker working along the second extent
  nt2::spawner<tag::fold_, BackEnd, target_type> spawn_wide;
  nt2::spawner<tag::fold_, BackEnd, value_type>  spawn_single;
  nt2::worker<tag::outer_fold_step_outcache_,BackEnd,Site,In,Neutral,Bop>
  step(in_,neutral_,bop_);

  bool const split_middle = covers_all_slices && (grain < middle_floor);
  std::size_t const stop  = begin + size;

  for(std::size_t o = begin; o < stop; ++o, out_base += inner, in_base += in_slice)
  {
    std::size_t out_pos = out_base;
    std::size_t in_pos  = in_base;

    // Aligned head: `cardinal` positions per iteration, folded as target_type
    for( std::size_t i = 0; i < inner_floor
       ; i += cardinal, out_pos += cardinal, in_pos += cardinal
       )
    {
      step.update(in_pos);
      target_type folded = neutral_(nt2::meta::as_<target_type>());

      if( split_middle )
        folded = spawn_wide( step, 0, middle_floor, grain);
      else if( middle_floor != 0 )
        folded = step(folded, 0, middle_floor);

      folded = step(folded, middle_floor, middle-middle_floor);
      nt2::run(out_, out_pos, folded);
    }

    // Tail: one position per iteration, folded as value_type
    out_pos = out_base + inner_floor;
    in_pos  = in_base + inner_floor;

    for(std::size_t i = inner_floor; i < inner; ++i, ++out_pos, ++in_pos)
    {
      step.update(in_pos);
      value_type folded = neutral_(nt2::meta::as_<value_type>());

      if( split_middle )
        folded = spawn_single(step, 0, middle_floor, grain);
      else if( middle_floor != 0 )
        folded = step(folded, 0, middle_floor);

      folded = step(folded, middle_floor, middle-middle_floor);
      nt2::run(out_, out_pos, folded);
    }
  }
}