// Enqueues a host-to-device copy of numElements values of ElemType on m_assignStream,
// then records m_assignCompleteEvent on that same stream so that completion of the
// copy can later be observed through the event.
//
//   cpuBuffer   - source buffer on the host
//   numElements - number of ElemType values to copy
//   gpuBuffer   - destination buffer on the device
//
// NOTE(review): the `|| "..."` suffix presumably hands a failing cudaError_t to the
// project's error handler; that operator is defined outside this block -- confirm.
// NOTE(review): whether the copy actually overlaps with host work depends on how
// cpuBuffer was allocated (pinned or pageable), which is not visible here.
void GPUDataTransferer<ElemType>::CopyCPUToGPUAsync(ElemType* cpuBuffer, size_t numElements, ElemType* gpuBuffer)
{
    PrepareDevice(m_deviceId);

    const size_t numBytes = numElements * sizeof(ElemType);
    cudaMemcpyAsync(gpuBuffer, cpuBuffer, numBytes, cudaMemcpyHostToDevice, m_assignStream) || "cudaMemcpyAsync failed";

    // Mark the point in m_assignStream at which the copy above has finished.
    cudaEventRecord(m_assignCompleteEvent, m_assignStream) || "cudaEventRecord failed";
}
// Enqueues a device-to-host copy of numElements values of ElemType on m_fetchStream,
// then records m_fetchCompleteEvent on that same stream so that completion of the
// copy can later be observed through the event.
//
//   gpuBuffer   - source buffer on the device
//   numElements - number of ElemType values to copy
//   cpuBuffer   - destination buffer on the host
//
// NOTE(review): the `|| "..."` suffix presumably hands a failing cudaError_t to the
// project's error handler; that operator is defined outside this block -- confirm.
void GPUDataTransferer<ElemType>::CopyGPUToCPUAsync(ElemType* gpuBuffer, size_t numElements, ElemType* cpuBuffer)
{
    PrepareDevice(m_deviceId);

    const size_t numBytes = numElements * sizeof(ElemType);
    cudaMemcpyAsync(cpuBuffer, gpuBuffer, numBytes, cudaMemcpyDeviceToHost, m_fetchStream) || "cudaMemcpyAsync failed";

    // Mark the point in m_fetchStream at which the copy above has finished.
    cudaEventRecord(m_fetchCompleteEvent, m_fetchStream) || "cudaEventRecord failed";
}
// Binds the transferer to deviceId, creates the fetch/assign completion events, and,
// when useConcurrentStreams is set and m_fetchStream is still NULL, creates the
// fetch and assign streams.
//
//   deviceId             - stored in m_deviceId and passed to PrepareDevice()
//   useConcurrentStreams - when false, no streams are created by this constructor
//
// NOTE(review): m_fetchStream is read before this constructor ever writes it, so it is
// presumably a static or otherwise pre-initialized member -- confirm in the class declaration.
GPUDataTransferer<ElemType>::GPUDataTransferer(int deviceId, bool useConcurrentStreams)
    : m_deviceId(deviceId)
{
    PrepareDevice(m_deviceId);

    // Completion events, created with timing disabled.
    // Do NOT pass cudaEventBlockingSync here (it supposedly yields the process): it totally
    // breaks cudaEventSynchronize(), making it take 50 or 100 ms at random.
    cudaEventCreateWithFlags(&m_fetchCompleteEvent, cudaEventDisableTiming) || "cudaEventCreateWithFlags failed";
    cudaEventCreateWithFlags(&m_assignCompleteEvent, cudaEventDisableTiming) || "cudaEventCreateWithFlags failed";

// MSVC C4127 is "conditional expression is constant"; silence it for the check below.
#pragma warning(disable : 4127)
    if (!useConcurrentStreams || (m_fetchStream != NULL))
    {
        // Either concurrent streams were not requested or the streams already exist.
        return;
    }

    cudaStreamCreateWithFlags(&m_fetchStream, cudaStreamNonBlocking) || "cudaStreamCreateWithFlags failed";
    cudaStreamCreateWithFlags(&m_assignStream, cudaStreamNonBlocking) || "cudaStreamCreateWithFlags failed";
}
// Destroys m_stream.
//
// Failures are reported rather than propagated out of the destructor:
//  - if PrepareDevice() throws, the destructor returns immediately and m_stream is
//    not destroyed;
//  - if cudaStreamDestroy() fails, the error string and code are written to std::cerr.
PrefetchGPUDataTransferer::~PrefetchGPUDataTransferer()
{
    try
    {
        PrepareDevice(m_deviceId);
    }
    catch (...)
    {
        // the error is already logged
        return;
    }

    const auto status = cudaStreamDestroy(m_stream);
    if (status == cudaSuccess)
    {
        return;
    }

    std::cerr << "cudaStreamDestroy failed (PrefetchGPUDataTransferer dtor): "
              << cudaGetErrorString(status)
              << " (cuda error " << status << ")"
              << std::endl;
}
// Enqueues a host-to-device copy of numElements * elementSize bytes on m_assignStream.
// No event is recorded by this function.
//
//   cpuBuffer   - source buffer on the host
//   numElements - number of elements to copy
//   elementSize - size of one element, in bytes
//   gpuBuffer   - destination buffer on the device
//
// NOTE(review): the `|| "..."` suffix presumably hands a failing cudaError_t to the
// project's error handler; that operator is defined outside this block -- confirm.
void GranularGPUDataTransferer::CopyCPUToGPUAsync(const void* cpuBuffer, size_t numElements, size_t elementSize, void* gpuBuffer)
{
    PrepareDevice(m_deviceId);

    const size_t numBytes = numElements * elementSize;
    cudaMemcpyAsync(gpuBuffer, cpuBuffer, numBytes, cudaMemcpyHostToDevice, m_assignStream) || "cudaMemcpyAsync failed";
}
// Blocks the calling host thread in cudaEventSynchronize() until m_fetchCompleteEvent
// has completed.
//
// NOTE(review): the `|| "..."` suffix presumably hands a failing cudaError_t to the
// project's error handler; that operator is defined outside this block -- confirm.
void GranularGPUDataTransferer::WaitForCopyGPUToCPU()
{
    PrepareDevice(m_deviceId);

    cudaEventSynchronize(m_fetchCompleteEvent) || "cudaEventSynchronize failed";
}
void GPUDataTransferer::WaitForCopyCPUToGPUAsync() { PrepareDevice(m_inner->m_deviceId); SyncEvent(m_inner->m_assignCompleteEvent); }
// Makes m_assignStream wait for m_syncEvent via cudaStreamWaitEvent(). The wait is
// queued on the stream; this call does not itself synchronize on the event from the host.
//
// NOTE(review): the `|| "..."` suffix presumably hands a failing cudaError_t to the
// project's error handler; that operator is defined outside this block -- confirm.
void GranularGPUDataTransferer::WaitForSyncPointOnAssignStreamAsync()
{
    PrepareDevice(m_deviceId);

    // flags 'must be 0'
    const unsigned int waitFlags = 0;
    cudaStreamWaitEvent(m_assignStream, m_syncEvent, waitFlags) || "cudaStreamWaitEvent failed";
}
// Records m_syncEvent on the stream returned by GetStream().
//
// NOTE(review): GetStream() is defined outside this block; going by this method's
// name it presumably returns the compute stream -- confirm.
// NOTE(review): the `|| "..."` suffix presumably hands a failing cudaError_t to the
// project's error handler; that operator is defined outside this block -- confirm.
void GranularGPUDataTransferer::RecordComputeStreamSyncPoint()
{
    PrepareDevice(m_deviceId);

    // The message names the failing API exactly so it can be found by searching for the call.
    cudaEventRecord(m_syncEvent, GetStream()) || "cudaEventRecord failed";
}
// Prepares this transferer's device and passes m_fetchCompleteEvent to SyncEvent().
//
// NOTE(review): SyncEvent() is defined outside this block; presumably it waits for
// the event to complete -- confirm against its definition.
void GPUDataTransferer<ElemType>::WaitForCopyGPUToCPUAsync()
{
    PrepareDevice(m_deviceId);

    SyncEvent(m_fetchCompleteEvent);
}