Exemplo n.º 1
0
    // Invokes details::scan_step once for each of `size` consecutive
    // positions, the first one being o_ + begin.
    //
    // begin : offset added to the member o_ to obtain the first position.
    // size  : number of consecutive positions to process.
    //
    // The value returned by scan_step is stored in a local and not read
    // again here; presumably the useful effect is what scan_step does with
    // out_ -- confirm against details::scan_step.
    void operator()(std::size_t begin, std::size_t size)
    {
      // Extents 0 and 1 of the input are forwarded to every scan step.
      extent_type dims = in_.extent();
      std::size_t inner_extent  = boost::fusion::at_c<0>(dims);
      std::size_t middle_extent = boost::fusion::at_c<1>(dims);

      std::size_t position = o_ + begin;

      for(std::size_t remaining = size; remaining != 0; --remaining, ++position)
      {
        // Each position starts again from the neutral element.
        value_type accumulated = neutral_(meta::as_<value_type>());

        accumulated = details::scan_step( accumulated, out_, in_, bop_
                                        , position, middle_extent, inner_extent
                                        , false
                                        );
      }
    }
Exemplo n.º 2
0
      // Processes `size` consecutive outer slices, the first one starting at
      // linear offset begin * (extent0 * extent1).
      //
      // begin : index of the first outer slice to process.
      // size  : number of outer slices to process.
      //
      // Two strategies are selected by comparing extent 0 with `grain`:
      //  - extent0 >= grain : one "incache" worker per outer slice, run over
      //    the range [0, extent0).
      //  - extent0 <  grain : one "outcache" worker per inner position, run
      //    over extent 1 in two pieces (a grain-multiple prefix, then the rest).
      // A spawner is only used when size equals extent 2; otherwise the worker
      // is called directly.
      void operator()(std::size_t begin, std::size_t size) const
      {
        typedef nt2::worker<tag::outer_scan_step_outcache_,BackEnd,Site,Out,In,Neutral,Bop>
                outcache_worker;
        typedef nt2::worker<tag::outer_scan_step_incache_,BackEnd,Site,Out,In,Neutral,Bop>
                incache_worker;

        extent_type dims = in_.extent();
        std::size_t inner  = boost::fusion::at_c<0>(dims);
        std::size_t middle = boost::fusion::at_c<1>(dims);
        std::size_t outer  = boost::fusion::at_c<2>(dims);
        std::size_t slab   = inner * middle;

        // Value reported by nt2::config::top_cache_line_size(2), expressed in
        // units of sizeof(value_type); used as the grain of every spawner call.
        std::size_t grain = nt2::config::top_cache_line_size(2)/sizeof(value_type);

        // Largest multiple of grain that does not exceed the middle extent.
        std::size_t middle_chunked = middle - (middle % grain);

        // True when this call covers as many slices as extent 2 holds.
        bool whole_outer = (size == outer);

        // Linear offset of the outer slice currently being processed.
        std::size_t slab_start = begin * slab;

        if( !(inner < grain) )
        {
          // Spawner/worker pair associated with the inner dimension.
          nt2::spawner<tag::transform_, BackEnd> spawn;

          for(std::size_t k = 0; k != size; ++k, slab_start += slab)
          {
            incache_worker work(out_,in_,neutral_,bop_, slab_start);

            if( whole_outer )
              spawn(work,0,inner,grain);
            else
              work(0,inner);
          }

          return;
        }

        // Spawner/worker pair associated with the middle dimension.
        nt2::spawner<tag::scan_, BackEnd, value_type> spawn;

        // The spawner handles the prefix only when it spans more than one grain.
        bool spawn_middle = whole_outer && (grain < middle_chunked);

        for(std::size_t k = 0; k != size; ++k, slab_start += slab)
        {
          std::size_t column = slab_start;

          for(std::size_t j = 0; j != inner; ++j, ++column)
          {
            outcache_worker work(out_,in_,neutral_,bop_,column);

            value_type running = neutral_(nt2::meta::as_<value_type>());

            // Grain-multiple prefix of the middle extent.
            if( spawn_middle )
              running = spawn( work, 0, middle_chunked, grain);
            else if( middle_chunked != 0 )
              running = work(running, 0, middle_chunked, false);

            // Remainder of the middle extent, always run directly.
            running = work(running, middle_chunked, middle-middle_chunked, false);
          }
        }
      }
Exemplo n.º 3
0
      // Processes `size` consecutive outer slices starting at slice `begin`,
      // emitting one result per inner position of each slice.
      //
      // begin : index of the first outer slice to process.
      // size  : number of outer slices to process.
      //
      // The inner extent is split into a part handled with target_type
      // (advancing `width` positions at a time) and a trailing part handled
      // with value_type. Two strategies are selected by comparing the length
      // of the first part with `grain`:
      //  - length >= grain : "incache" workers run over the inner dimension.
      //  - length <  grain : an "outcache" worker runs over extent 1 for every
      //    inner position and the result is stored with nt2::run.
      // A spawner is only used when size equals extent 2 and the range is
      // larger than one grain; otherwise the worker is called directly.
      void operator()(std::size_t begin, std::size_t size) const
      {
        typedef nt2::worker<tag::outer_fold_step_outcache_,BackEnd,Site,In,Neutral,Bop>
                outcache_worker;
        typedef nt2::worker<tag::outer_fold_step_incache_,BackEnd,Site,Out,In,Neutral,Bop>
                simd_incache_worker;
        typedef nt2::worker<tag::outer_fold_step_incache_,BackEnd,tag::cpu_,Out,In,Neutral,Bop>
                scalar_incache_worker;

        // Cardinal of target_type: positions consumed per target_type step.
        static const std::size_t width = boost::simd::meta::cardinal_of<target_type>::value;

        extent_type dims = in_.extent();
        std::size_t inner  = boost::fusion::at_c<0>(dims);
        std::size_t middle = boost::fusion::at_c<1>(dims);
        std::size_t outer  = boost::fusion::at_c<2>(dims);
        std::size_t slab   = inner * middle;

        // Value reported by nt2::config::top_cache_line_size(2), expressed in
        // units of sizeof(value_type); used as the grain of every spawner call.
        std::size_t grain = nt2::config::top_cache_line_size(2) / sizeof(value_type);

        // Length of the inner extent handled with target_type.
        // NOTE(review): grain is already divided by sizeof(value_type) and is
        // divided by it a second time here -- confirm this is intended.
        std::size_t simd_step  = (grain / (sizeof(value_type)*width))*width;
        std::size_t inner_simd = boost::simd::align_under(inner, simd_step);

        // Largest multiple of grain that does not exceed the middle extent.
        std::size_t middle_chunked = middle - (middle % grain);

        // True when this call covers as many slices as extent 2 holds.
        bool whole_outer = (size == outer);

        // Output / input linear offsets of the slice currently being processed.
        std::size_t out_base = begin * inner;
        std::size_t in_base  = begin * slab;

        if( !(inner_simd < grain) )
        {
          // Spawner/workers associated with the inner dimension.
          nt2::spawner<tag::transform_, BackEnd> spawn;

          simd_incache_worker   simd_work(out_,in_,neutral_,bop_);
          scalar_incache_worker scalar_work(out_,in_,neutral_,bop_);

          bool spawn_inner = whole_outer && (grain < inner_simd);

          for(std::size_t k = 0; k != size; ++k, out_base += inner, in_base += slab)
          {
            simd_work.update(out_base, in_base);
            scalar_work.update(out_base, in_base);

            // target_type part of the inner extent
            if( spawn_inner )
              spawn(simd_work,0,inner_simd,grain);
            else if( inner_simd != 0 )
              simd_work(0,inner_simd);

            // value_type part of the inner extent
            scalar_work(inner_simd,inner-inner_simd);
          }

          return;
        }

        // Spawners/worker associated with the middle dimension.
        nt2::spawner<tag::fold_, BackEnd, target_type> spawn_simd;
        nt2::spawner<tag::fold_, BackEnd, value_type>  spawn_scalar;

        outcache_worker work(in_,neutral_,bop_);

        // The spawner handles the prefix only when it spans more than one grain.
        bool spawn_middle = whole_outer && (grain < middle_chunked);

        for(std::size_t k = 0; k != size; ++k, out_base += inner, in_base += slab)
        {
          // target_type part: `width` inner positions per iteration
          for(std::size_t j = 0, dst = out_base, src = in_base;
              j < inner_simd;
              j += width, dst += width, src += width)
          {
            work.update(src);

            target_type folded = neutral_(nt2::meta::as_<target_type>());

            if( spawn_middle )
              folded = spawn_simd( work, 0, middle_chunked, grain);
            else if( middle_chunked != 0 )
              folded = work(folded, 0, middle_chunked);

            // Remainder of the middle extent, always run directly.
            folded = work(folded, middle_chunked, middle-middle_chunked);

            nt2::run(out_, dst, folded);
          }

          // value_type part: one inner position per iteration
          for(std::size_t j = inner_simd, dst = out_base + inner_simd, src = in_base + inner_simd;
              j < inner;
              ++j, ++dst, ++src)
          {
            work.update(src);

            value_type folded = neutral_(nt2::meta::as_<value_type>());

            if( spawn_middle )
              folded = spawn_scalar(work, 0, middle_chunked, grain);
            else if( middle_chunked != 0 )
              folded = work(folded, 0, middle_chunked);

            // Remainder of the middle extent, always run directly.
            folded = work(folded, middle_chunked, middle-middle_chunked);

            nt2::run(out_, dst, folded);
          }
        }
      }