/// Enqueue a copy of the elements of `a` into `b` using cudaMemcpyAsync.
///
/// The byte count is `a.size() * sizeof(T)` where `T` is `Out::value_type`,
/// and the transfer direction is whatever `copy_<HDI,HDO>::mode()` yields.
///
/// @param a      source container; read through `a.data()` / `a.size()`.
/// @param b      destination container; written through `b.data()`.
///               NOTE(review): nothing here checks that `b` can hold
///               `a.size()` elements - presumably the caller guarantees it.
/// @param stream stream the copy is enqueued on (defaults to stream 0).
///
/// The two unnamed `HDI` / `HDO` arguments are never read; only their types
/// are used, to select the copy mode.
///
/// This function issues no synchronization itself: the call is the async
/// variant, so the caller must synchronize `stream` before relying on `b`.
/// The status returned by cudaMemcpyAsync is handed to CUDA_ERROR.
inline void copy( In const& a, Out& b , HDI const& , HDO const&
                , cudaStream_t stream = 0)
{
  using T = typename Out::value_type;

  // TODO (carried over from the original source; its intent is not stated)
  // NOTE: this marker must stay on its own line - a `//` comment placed in
  // front of the call on the same line disables the copy and the closing brace.
  CUDA_ERROR(cudaMemcpyAsync( (T*)b.data()
                            , a.data()
                            , a.size() * sizeof(T)
                            , copy_<HDI,HDO>::mode()
                            , stream
                            ));
}
/// Copy one block from the device buffer of `streamid` into `out`, once.
///
/// If `block_stream_dth[blockid]` is already set the call does nothing.
/// Otherwise it enqueues a device-to-host cudaMemcpyAsync on `stream`,
/// waits for that stream with cudaStreamSynchronize, and then sets the flag.
///
/// @param out       destination; written through `out.data()`.
/// @param blockid   index into `block_stream_dth` identifying the block.
/// @param stream    stream used for the copy and then synchronized.
/// @param streamid  index passed to `buffers.get_device()` to get the source.
/// @param leftover  when non-zero, number of elements to copy instead of
///                  `blocksize` (presumably the size of a trailing partial
///                  block - confirm against the caller).
///
/// The unnamed `nt2::pinned_&` argument is never read; it only takes part in
/// overload selection.
///
/// Both CUDA calls have their status passed to CUDA_ERROR.
inline void transfer_dth( Out & out , int blockid, Stream & stream
                        , std::size_t streamid
                        , std::size_t leftover
                        , nt2::pinned_ &
                        )
{
  // Block already transferred: nothing to do.
  if(block_stream_dth[blockid]) return;

  // A non-zero leftover overrides the regular block size.
  std::size_t const count = (leftover != 0) ? leftover : blocksize;

  CUDA_ERROR(cudaMemcpyAsync( out.data()
                            , buffers.get_device(streamid)
                            , count * sizeof(T)
                            , cudaMemcpyDeviceToHost
                            , stream
                            ));

  // The synchronize status is now checked like the copy above; previously
  // its return value was dropped, hiding asynchronous transfer failures.
  CUDA_ERROR(cudaStreamSynchronize(stream));

  // Mark the block as done only after the synchronize has been issued and checked.
  block_stream_dth[blockid] = true;
}