template <class T> inline
void CUDANodeMemoryModel::copyBuffers(size_t size, const ArrayRCP<const T> &buffSrc, const ArrayRCP<T> &buffDest) {
  CHECK_COMPUTE_BUFFER(buffSrc);
  CHECK_COMPUTE_BUFFER(buffDest);
  TEUCHOS_TEST_FOR_EXCEPTION(
    (size_t)buffDest.size() < size, std::runtime_error,
    "CUDANodeMemoryModel::copyBuffers<" << Teuchos::TypeNameTraits<T>::name ()
    << ">: invalid copy. Device destination buffer has size " << buffDest.size ()
    << ", which is less than the requested copy size " << size << ".");
  TEUCHOS_TEST_FOR_EXCEPTION(
    (size_t)buffSrc.size() < size, std::runtime_error,
    "CUDANodeMemoryModel::copyBuffers<" << Teuchos::TypeNameTraits<T>::name ()
    << ">: invalid copy. Device source buffer has size " << buffSrc.size ()
    << ", which is less than the requested copy size " << size << ".");
#ifdef HAVE_KOKKOSCLASSIC_CUDA_NODE_MEMORY_PROFILING
  ++numCopiesD2D_;
  bytesCopiedD2D_ += size*sizeof(T);
#endif
#ifdef HAVE_KOKKOSCLASSIC_CUDA_NODE_MEMORY_TRACE
  std::cerr << "copyBuffers<" << Teuchos::TypeNameTraits<T>::name()
            << "> of size " << sizeof(T) * size << std::endl;
#endif
  cudaError_t err = cudaMemcpy( buffDest.getRawPtr(), buffSrc.getRawPtr(),
                                size*sizeof(T), cudaMemcpyDeviceToDevice);
  TEUCHOS_TEST_FOR_EXCEPTION(
    cudaSuccess != err, std::runtime_error,
    "Kokkos::CUDANodeMemoryModel::copyBuffers<"
    << Teuchos::TypeNameTraits<T>::name ()
    << ">(): cudaMemcpy() returned error: " << cudaGetErrorString (err) );
}
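/* Usage sketch (illustration only, not part of this header).  It assumes a
   concrete CUDA node type -- MyCUDANode below is hypothetical -- that derives
   from CUDANodeMemoryModel and exposes the node API's allocBuffer<T>().

   \code
   using Teuchos::ArrayRCP;

   MyCUDANode node;
   const size_t n = 100;
   // Both allocations live in device memory.
   ArrayRCP<float> src = node.allocBuffer<float> (n);
   ArrayRCP<float> dst = node.allocBuffer<float> (n);
   // ... fill src on the device ...
   // Device-to-device copy.  Throws std::runtime_error if either buffer
   // holds fewer than n entries, or if cudaMemcpy() reports an error.
   node.copyBuffers<float> (n, src, dst);
   \endcode
*/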
/*! \brief Copy data between buffers.

    @param[in]     size     The size of the copy, greater than zero.
    @param[in]     buffSrc  The source buffer, with length at least as large as \c size.
    @param[in,out] buffDest The destination buffer, with length at least as large as \c size.

    \post The data is guaranteed to have been copied before any other usage
      of buffSrc or buffDest occurs.
*/
template <class T> inline
void copyBuffers(size_t size, const ArrayRCP<const T> &buffSrc, const ArrayRCP<T> &buffDest) {
  if (isHostNode == false) {
    CHECK_COMPUTE_BUFFER(buffSrc);
    CHECK_COMPUTE_BUFFER(buffDest);
  }
  ArrayView<const T> av_src = buffSrc(0,size);
  ArrayView<T>       av_dst = buffDest(0,size);
  std::copy(av_src.begin(), av_src.end(), av_dst.begin());
}
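/* On host nodes, copyBuffers() above reduces to std::copy over two ArrayView
   slices.  A minimal standalone sketch of that slicing idiom, using only
   Teuchos (no node object involved):

   \code
   #include <Teuchos_ArrayRCP.hpp>
   #include <algorithm>
   using Teuchos::ArrayRCP;
   using Teuchos::ArrayView;

   const size_t n = 10;
   ArrayRCP<double> src = Teuchos::arcp<double> (n);
   ArrayRCP<double> dst = Teuchos::arcp<double> (n);
   std::fill (src.begin (), src.end (), 42.0);
   // buff(offset, size) yields a (debug-mode bounds-checked) view of
   // entries [offset, offset+size) without copying them.
   ArrayView<const double> av_src = src.getConst ()(0, n);
   ArrayView<double>       av_dst = dst (0, n);
   std::copy (av_src.begin (), av_src.end (), av_dst.begin ());
   \endcode
*/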
inline void
CUDANodeMemoryModel::readyBuffers(ArrayView<ArrayRCP<const char> > buffers,
                                  ArrayView<ArrayRCP<char> > ncBuffers) {
#ifdef HAVE_KOKKOSCLASSIC_DEBUG
  for (size_t i=0; i < (size_t)buffers.size(); ++i) {
    CHECK_COMPUTE_BUFFER(buffers[i]);
  }
  for (size_t i=0; i < (size_t)ncBuffers.size(); ++i) {
    CHECK_COMPUTE_BUFFER(ncBuffers[i]);
  }
#endif
  (void)buffers;
  (void)ncBuffers;
}
inline void readyBuffers(ArrayView<ArrayRCP<const char> > buffers,
                         ArrayView<ArrayRCP<char> > ncBuffers) {
#ifdef HAVE_KOKKOSCLASSIC_DEBUG
  if (isHostNode == false) {
    for (size_t i=0; i < (size_t)buffers.size(); ++i) {
      CHECK_COMPUTE_BUFFER(buffers[i]);
    }
    for (size_t i=0; i < (size_t)ncBuffers.size(); ++i) {
      CHECK_COMPUTE_BUFFER(ncBuffers[i]);
    }
  }
#endif
  (void)buffers;
  (void)ncBuffers;
}
//! \brief Return a const view of a buffer for use on the host.
template <class T> inline
ArrayRCP<const T> viewBuffer(size_t size, ArrayRCP<const T> buff) {
  if (isHostNode == false) {
    CHECK_COMPUTE_BUFFER(buff);
  }
  return buff.persistingView(0,size);
}
//! \brief Return a non-const view of a buffer for use on the host.
template <class T> inline
ArrayRCP<T> viewBufferNonConst(ReadWriteOption rw, size_t size, const ArrayRCP<T> &buff) {
  (void) rw; // Silence "unused parameter" compiler warning
  if (isHostNode == false) {
    CHECK_COMPUTE_BUFFER(buff);
  }
  return buff.persistingView(0,size);
}
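/* For host nodes, both view methods above are zero-copy: persistingView()
   returns an ArrayRCP that shares (and keeps alive) the original allocation,
   so writes through the view are visible through the parent buffer.  This
   contrasts with the CUDA implementations further below, which must allocate
   a host buffer and copy.  A small Teuchos-only sketch of the aliasing:

   \code
   using Teuchos::ArrayRCP;

   ArrayRCP<double> buff = Teuchos::arcp<double> (20);
   ArrayRCP<double> view = buff.persistingView (0, 10);
   view[0] = 3.14; // visible as buff[0] as well: same underlying memory
   \endcode
*/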
/*! \brief Copy data from host memory to a parallel buffer.

    @param[in]  size     The number of entries to copy from \c hostSrc to \c buffDest.
    @param[in]  hostSrc  The location in host memory from where the data is copied.
    @param[out] buffDest The parallel buffer to which the data is copied.

    \pre \c size is non-negative.
    \pre \c hostSrc has length equal to \c size.
    \pre \c buffDest has length at least <tt>size</tt>.
    \post On return, entries in the range <tt>[0 , size)</tt> of \c hostSrc
      are allowed to be written to.  The data is guaranteed to be present in
      \c buffDest before it is used in a parallel computation.
*/
template <class T> inline
void copyToBuffer(size_t size, const ArrayView<const T> &hostSrc, const ArrayRCP<T> &buffDest) {
  if (isHostNode == false) {
    CHECK_COMPUTE_BUFFER(buffDest);
  }
  ArrayRCP<const T> buffSrc = arcpFromArrayView(hostSrc);
  copyBuffers<T>(size, buffSrc, buffDest);
}
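/* Typical use: stage host data into a compute buffer before launching a
   parallel computation.  A hedged sketch; MyNode is hypothetical and stands
   for any node type implementing this interface (allocBuffer<T>() assumed):

   \code
   #include <vector>
   using Teuchos::ArrayRCP;
   using Teuchos::ArrayView;

   std::vector<int> hostData (50, 7);
   MyNode node;
   ArrayRCP<int> devBuf = node.allocBuffer<int> (hostData.size ());
   // ArrayView wraps the host data without copying it.
   ArrayView<const int> hostView (&hostData[0], hostData.size ());
   node.copyToBuffer<int> (hostData.size (), hostView, devBuf);
   // Per the postcondition, hostData may be modified freely once this returns.
   \endcode
*/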
template <class T> inline
ArrayRCP<const T> CUDANodeMemoryModel::viewBuffer(size_t size, ArrayRCP<const T> buff) {
  CHECK_COMPUTE_BUFFER(buff);
  ArrayRCP<T> hostBuff;
  if (size != 0) {
    hostBuff = arcp<T>(size);
#ifdef HAVE_KOKKOSCLASSIC_CUDA_NODE_MEMORY_TRACE
    std::cerr << "viewBuffer() -> ";
#endif
    this->template copyFromBuffer<T>(size, buff, hostBuff());
  }
  return hostBuff;
}
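/* Note that on the CUDA node, this const "view" is not a live window into
   device memory: it is a freshly allocated host buffer filled by a
   device-to-host copy (and a null ArrayRCP when size is zero).  Sketch,
   with MyCUDANode again hypothetical:

   \code
   MyCUDANode node;
   Teuchos::ArrayRCP<double> devBuf = node.allocBuffer<double> (100);
   // ... compute into devBuf on the device ...
   Teuchos::ArrayRCP<const double> hostCopy =
     node.viewBuffer<double> (100, devBuf);
   double first = hostCopy[0]; // read device results on the host
   \endcode
*/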
template <class T> inline
ArrayRCP<T> CUDANodeMemoryModel::viewBufferNonConst(ReadWriteOption rw, size_t size, const ArrayRCP<T> &buff) {
  CHECK_COMPUTE_BUFFER(buff);
  // Create a copy-back deallocator that copies back to buff.
  CUDANodeCopyBackDeallocator<T> dealloc(buff.persistingView(0,size),
                                         rcpFromRef(*this));
  // It allocates a host buffer with the appropriate deallocator embedded.
  ArrayRCP<T> hostBuff = dealloc.alloc();
  if (rw == ReadWrite) {
#ifdef HAVE_KOKKOSCLASSIC_CUDA_NODE_MEMORY_TRACE
    std::cerr << "viewBufferNonConst(ReadWrite) -> ";
#endif
    // The caller intends to read as well as write, so copy the current
    // contents of the device buffer into the host view.
    this->template copyFromBuffer<T>(size, buff, hostBuff());
  }
  else {
#ifdef HAVE_KOKKOSCLASSIC_CUDA_NODE_MEMORY_TRACE
    std::cerr << "viewBufferNonConst(WriteOnly)" << std::endl;
#endif
  } // else rw == WriteOnly, and we need no copy
  return hostBuff;
}
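/* The copy-back happens when the caller releases the last reference to the
   returned host view: the CUDANodeCopyBackDeallocator embedded in that
   ArrayRCP copies the host contents back into the device buffer at
   deallocation.  A hedged usage sketch (MyCUDANode is hypothetical):

   \code
   MyCUDANode node;
   Teuchos::ArrayRCP<double> devBuf = node.allocBuffer<double> (100);
   {
     // WriteOnly: skip the device-to-host copy, since every entry will be
     // overwritten on the host anyway.
     Teuchos::ArrayRCP<double> hostView =
       node.viewBufferNonConst<double> (WriteOnly, 100, devBuf);
     std::fill (hostView.begin (), hostView.end (), 1.0);
   } // last reference to hostView dies here; the copy-back deallocator
     // writes its contents back to devBuf
   \endcode
*/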