TEST(HostAlloc, FlagRetrieval) {
    cudaError_t ret;
    void * ptrs[8];
    unsigned int flags[8];

    int device;
    ret = cudaGetDevice(&device);
    ASSERT_EQ(cudaSuccess, ret);

    struct cudaDeviceProp prop;
    ret = cudaGetDeviceProperties(&prop, device);
    ASSERT_EQ(cudaSuccess, ret);

    // Allocate small pinned buffers with every combination of the portable,
    // mapped, and write-combined flags.
    for (size_t i = 0; i < (sizeof(flags) / sizeof(flags[0])); i++) {
        unsigned int flag = cudaHostAllocDefault;
        if (i & 0x1) {
            flag |= cudaHostAllocPortable;
        }
        if (i & 0x2) {
            flag |= cudaHostAllocMapped;
        }
        if (i & 0x4) {
            flag |= cudaHostAllocWriteCombined;
        }

        ret = cudaHostAlloc(&ptrs[i], 4, flag);
        ASSERT_EQ(cudaSuccess, ret);
        flags[i] = flag;
    }

    for (size_t i = 0; i < (sizeof(flags) / sizeof(flags[0])); i++) {
        unsigned int flag;
        ret = cudaHostGetFlags(&flag, ptrs[i]);
        ASSERT_EQ(cudaSuccess, ret);

        // The test expects cudaHostAllocMapped to be reported whenever the
        // device can map host memory, even if the allocation did not ask for it.
        const unsigned int expected = flags[i] |
            (prop.canMapHostMemory ? cudaHostAllocMapped : 0);
        EXPECT_EQ(expected, flag);
    }

    for (size_t i = 0; i < (sizeof(flags) / sizeof(flags[0])); i++) {
        ret = cudaFreeHost(ptrs[i]);
        EXPECT_EQ(cudaSuccess, ret);
    }
}
void AsyncCopier::copyHtoD(void* dest, const void* src, size_t size) {
  VLOG(1) << "copyHtoD " << size;
  auto pdest = static_cast<uint8_t*>(dest);
  auto psrc = static_cast<const uint8_t*>(src);

  unsigned int flags;
  auto err = cudaHostGetFlags(&flags, const_cast<void*>(src));
  if (err == cudaSuccess) {
    // Page-locked using cudaHostAlloc / cudaHostRegister, copy directly.
    checkCudaError(cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice),
                   "cudaMemcpyAsync");
    return;
  } else if (err != cudaErrorInvalidValue) {
    checkCudaError(err, "invalid return code from cudaHostGetFlags");
  }
  cudaGetLastError();  // reset last error
  // This is dicey -- what if another kernel has completed with an error?
  // But there's nothing else we can do, as any cuda function may return an
  // error from a previous kernel launch.

  if (size > bufferSize_) {
    // Copy synchronously.
    checkCudaError(cudaMemcpy(dest, src, size, cudaMemcpyHostToDevice),
                   "cudaMemcpy");
    return;
  }

  Event* eventToWait = nullptr;

  // Stage as much of the remaining input as fits in one free range of the
  // pinned staging buffer, issue an async HtoD copy, and record an event.
  auto copyRange = [this, &size, &pdest, &psrc] (AllocatedBlock& range) {
    size_t n = std::min(size, range.length);
    range.length = n;
    VLOG(1) << "Copy " << range.start << " + " << n;
    auto bufPtr = buffer_.get() + range.start;
    memcpy(bufPtr, psrc, n);
    checkCudaError(cudaMemcpyAsync(pdest, bufPtr, n, cudaMemcpyHostToDevice),
                   "cudaMemcpyAsync");
    pdest += n;
    psrc += n;
    size -= n;
    checkCudaError(cudaEventRecord(*range.event->event), "cudaEventRecord");
    allocated_.push_back(range);
  };

  for (;;) {
    {
      std::lock_guard<std::mutex> lock(mutex_);
      if (eventToWait) {
        releaseEventLocked(eventToWait);
        eventToWait = nullptr;
      }

      // Always reap completed copies to free up staging buffer space.
      while (!allocated_.empty() && pollEvent(allocated_.front().event)) {
        releaseEventLocked(allocated_.front().event);
        allocated_.pop_front();
      }

      auto ranges = getRangesLocked();
      if (!ranges.empty()) {
        auto ev = getEventLocked();
        for (auto it = ranges.begin(); size != 0 && it != ranges.end(); ++it) {
          auto& range = *it;
          ++ev->refCount;
          range.event = ev;
          copyRange(range);
        }
        releaseEventLocked(ev);
        if (size == 0) {
          break;
        }
      }

      // Sigh, we have to wait.
      eventToWait = allocated_.front().event;
      ++eventToWait->refCount;
    }

    DCHECK(eventToWait);
    VLOG(1) << "Waiting, remaining " << size;
    waitEvent(eventToWait);
  }

  VLOG(1) << "End copyHtoD";
  DCHECK(!eventToWait);
}
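The copyHtoD routine above decides between a direct async copy and a staged copy by probing the source pointer with cudaHostGetFlags: success means the memory is page-locked, cudaErrorInvalidValue means it is pageable. Below is a minimal standalone sketch of that probe, assuming only the CUDA runtime API; the helper name isPinnedHost is an illustrative assumption and not part of any snippet above.

#include <cuda_runtime.h>

// Hypothetical helper: returns true if p refers to page-locked host memory
// (allocated with cudaHostAlloc or registered with cudaHostRegister).
static bool isPinnedHost(const void* p) {
    unsigned int flags = 0;
    cudaError_t err = cudaHostGetFlags(&flags, const_cast<void*>(p));
    if (err == cudaSuccess) {
        return true;  // pinned; flags now holds the allocation flags
    }
    // cudaErrorInvalidValue means "not a pinned host pointer". Clear the
    // sticky error state so unrelated calls do not observe it later.
    cudaGetLastError();
    return false;
}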
cudaError_t WINAPI wine_cudaHostGetFlags(unsigned int *pFlags, void *pHost) {
    WINE_TRACE("\n");
    return cudaHostGetFlags( pFlags, pHost );
}