// 内部使用的 // 如果当前未初始化直接在GPU分配内存 // 如果当前在CPU,则在GPU上分配内存并且复制到GPU // 如果数据已经在GPU则啥也不做 inline void SyncedMemory::to_gpu() { #ifndef CPU_ONLY switch (head_) { case UNINITIALIZED: // 获取设备 CUDA_CHECK(cudaGetDevice(&gpu_device_)); // 在设备上分配内存 CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_)); // 初始化为0 caffe_gpu_memset(size_, 0, gpu_ptr_); head_ = HEAD_AT_GPU; own_gpu_data_ = true; break; case HEAD_AT_CPU: if (gpu_ptr_ == NULL) { CUDA_CHECK(cudaGetDevice(&gpu_device_)); CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_)); own_gpu_data_ = true; } caffe_gpu_memcpy(size_, cpu_ptr_, gpu_ptr_); head_ = SYNCED; break; case HEAD_AT_GPU: case SYNCED: break; } #else NO_GPU; #endif }
// Writing through mutable_gpu_data() must move the head to the GPU, and a
// subsequent cpu_data() read must bring the written bytes back to the host.
// Done twice to verify the GPU->CPU sync repeats after re-dirtying the device.
TEST_F(SyncedMemoryTest, TestGPUWrite) {
  SyncedMemory mem(10);
  void* device_ptr = mem.mutable_gpu_data();
  EXPECT_EQ(mem.head(), SyncedMemory::HEAD_AT_GPU);

  // Round one: fill the device buffer with 1s, then check them on the host.
  caffe_gpu_memset(mem.size(), 1, device_ptr);
  const char* host_bytes = static_cast<const char*>(mem.cpu_data());
  for (size_t i = 0; i < mem.size(); ++i) {
    EXPECT_EQ(host_bytes[i], 1);
  }
  EXPECT_EQ(mem.head(), SyncedMemory::SYNCED);

  // Round two: dirty the device again with 2s and confirm a fresh sync.
  device_ptr = mem.mutable_gpu_data();
  EXPECT_EQ(mem.head(), SyncedMemory::HEAD_AT_GPU);
  caffe_gpu_memset(mem.size(), 2, device_ptr);
  host_bytes = static_cast<const char*>(mem.cpu_data());
  for (size_t i = 0; i < mem.size(); ++i) {
    EXPECT_EQ(host_bytes[i], 2);
  }
  EXPECT_EQ(mem.head(), SyncedMemory::SYNCED);
}
inline void SyncedMemory::to_gpu() { #ifndef CPU_ONLY switch (head_) { case UNINITIALIZED: CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_)); caffe_gpu_memset(size_, 0, gpu_ptr_); head_ = HEAD_AT_GPU; break; case HEAD_AT_CPU: if (gpu_ptr_ == NULL) { CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_)); } caffe_gpu_memcpy(size_, cpu_ptr_, gpu_ptr_); head_ = SYNCED; break; case HEAD_AT_GPU: case SYNCED: break; } #else NO_GPU; #endif }
// Ensure a valid copy of the data exists on the GPU, for either the CUDA or
// the Greentea (OpenCL/ViennaCL) backend, chosen at runtime via
// device_context_->backend().
//  - UNINITIALIZED: allocate a device buffer and zero-fill it.
//  - HEAD_AT_CPU:   allocate a device buffer if needed, then copy CPU -> GPU.
//  - HEAD_AT_GPU / SYNCED: nothing to do.
inline void SyncedMemory::to_gpu() {
#ifndef CPU_ONLY
  switch (head_) {
    case UNINITIALIZED: {
      if (device_context_->backend() == Backend::BACKEND_CUDA) {
#ifdef USE_CUDA
        // CUDA path: plain device allocation, tracked for memory accounting.
        CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
        device_context_->IncreaseMemoryUsage(size_);
        caffe_gpu_memset(size_, 0, gpu_ptr_);
#endif  // USE_CUDA
      } else {
#ifdef USE_GREENTEA
        viennacl::ocl::context ctx = viennacl::ocl::get_context(
            device_context_->id());
        // Drain the queue before (and after) touching the buffer so the
        // allocation/memset are not reordered against in-flight kernels.
        ctx.get_queue().finish();
        cl_int err;
        if (ctx.devices()[0].type() == CL_DEVICE_TYPE_CPU) {
          // CL_MEM_ALLOC_HOST_PTR on CPU-type OpenCL devices — presumably to
          // allow zero-copy host access; TODO(review) confirm intent.
          cl_gpu_mem_ = clCreateBuffer(ctx.handle().get(),
                                       CL_MEM_READ_WRITE
                                       | CL_MEM_ALLOC_HOST_PTR,
                                       size_, nullptr, &err);
        } else {
          cl_gpu_mem_ = clCreateBuffer(ctx.handle().get(), CL_MEM_READ_WRITE,
                                       size_, nullptr, &err);
        }
        CHECK_EQ(0, err) << "OpenCL buffer allocation of size "
                         << size_ << " failed.";
        device_context_->IncreaseMemoryUsage(size_);
        // Zero-initialize the new buffer.
        int alpha = 0;
        greentea_memset(device_context_->id(), size_, alpha, cl_gpu_mem_, 0);
        // The generic gpu_ptr_ aliases the cl_mem handle for this backend.
        gpu_ptr_ = reinterpret_cast<void*>(cl_gpu_mem_);
        ctx.get_queue().finish();
#endif  // USE_GREENTEA
      }
      head_ = HEAD_AT_GPU;
      break;
    }
    case HEAD_AT_CPU: {
      if (device_context_->backend() == Backend::BACKEND_CUDA) {
#ifdef USE_CUDA
        // Lazily allocate the CUDA buffer, then upload the host data.
        if (gpu_ptr_ == nullptr) {
          CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
          device_context_->IncreaseMemoryUsage(size_);
        }
        caffe_gpu_memcpy(size_, cpu_ptr_, gpu_ptr_);
#endif  // USE_CUDA
      } else {
#ifdef USE_GREENTEA
        viennacl::ocl::context ctx = viennacl::ocl::get_context(
            device_context_->id());
        ctx.get_queue().finish();
        // Lazily allocate the OpenCL buffer (same device-type logic as above).
        if (gpu_ptr_ == nullptr) {
          cl_int err;
          if (ctx.devices()[0].type() == CL_DEVICE_TYPE_CPU) {
            cl_gpu_mem_ = clCreateBuffer(
                ctx.handle().get(),
                CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                size_, nullptr, &err);
          } else {
            cl_gpu_mem_ = clCreateBuffer(ctx.handle().get(), CL_MEM_READ_WRITE,
                                         size_, nullptr, &err);
          }
          CHECK_EQ(0, err) << "OpenCL buffer allocation of size "
                           << size_ << " failed.";
          device_context_->IncreaseMemoryUsage(size_);
          gpu_ptr_ = reinterpret_cast<void*>(cl_gpu_mem_);
          ctx.get_queue().finish();
        }
        // Upload host data into the OpenCL buffer (offset 0), then drain the
        // queue so the copy is complete before the head state changes.
        greentea_gpu_memcpy(size_, cpu_ptr_, (cl_mem) gpu_ptr_, 0, &ctx);
        ctx.get_queue().finish();
#endif  // USE_GREENTEA
      }
      head_ = SYNCED;
      break;
    }
    case HEAD_AT_GPU:
    case SYNCED:
      // Device copy already current — nothing to do.
      break;
  }
#else
  NO_GPU;  // built CPU-only: any GPU request is a usage error
#endif  // CPU_ONLY
}