// Round-trips data CPU -> GPU twice and verifies that the device copy
// matches what was written on the host each time, checking the head()
// state transitions (HEAD_AT_CPU after a mutable CPU write, SYNCED after
// a gpu_data() read).
TEST_F(SyncedMemoryTest, TestGPURead) {
  SyncedMemory mem(10);
  void* cpu_data = mem.mutable_cpu_data();
  EXPECT_EQ(mem.head(), SyncedMemory::HEAD_AT_CPU);
  caffe_memset(mem.size(), 1, cpu_data);
  const void* gpu_data = mem.gpu_data();
  EXPECT_EQ(mem.head(), SyncedMemory::SYNCED);
  // Copy the device buffer back and check every byte equals 1.
  // Use mem.size() rather than a hard-coded 10 so the test stays correct
  // if the buffer size above is ever changed.
  char* recovered_value = new char[mem.size()];
  caffe_gpu_memcpy(mem.size(), gpu_data, recovered_value);
  for (int i = 0; i < mem.size(); ++i) {
    EXPECT_EQ(recovered_value[i], 1);
  }
  // do another round with a different fill value
  cpu_data = mem.mutable_cpu_data();
  EXPECT_EQ(mem.head(), SyncedMemory::HEAD_AT_CPU);
  caffe_memset(mem.size(), 2, cpu_data);
  for (int i = 0; i < mem.size(); ++i) {
    EXPECT_EQ(static_cast<char*>(cpu_data)[i], 2);
  }
  gpu_data = mem.gpu_data();
  EXPECT_EQ(mem.head(), SyncedMemory::SYNCED);
  // check if values are the same
  caffe_gpu_memcpy(mem.size(), gpu_data, recovered_value);
  for (int i = 0; i < mem.size(); ++i) {
    EXPECT_EQ(recovered_value[i], 2);
  }
  delete[] recovered_value;
}
// Ensure the data is present and up to date on the host (CPU) side.
//   UNINITIALIZED -> allocate host memory, zero-fill, mark HEAD_AT_CPU.
//   HEAD_AT_GPU   -> (GPU builds only) lazily allocate the host buffer if
//                    needed, copy device -> host, mark SYNCED.
//   HEAD_AT_CPU / SYNCED -> nothing to do; the host copy is already current.
inline void SyncedMemory::to_cpu() {
  switch (head_) {
  case UNINITIALIZED:
    // First touch: allocate and zero the host buffer so reads see zeros.
    CaffeMallocHost(&cpu_ptr_, size_);
    memset(cpu_ptr_, 0, size_);
    head_ = HEAD_AT_CPU;
    own_cpu_data_ = true;  // this object is now responsible for freeing cpu_ptr_
    break;
  case HEAD_AT_GPU:
#ifndef CPU_ONLY
    if (cpu_ptr_ == NULL) {
      // Host buffer was never allocated (the data was born on the GPU).
      CaffeMallocHost(&cpu_ptr_, size_);
      own_cpu_data_ = true;
    }
    caffe_gpu_memcpy(size_, gpu_ptr_, cpu_ptr_);
    head_ = SYNCED;  // host and device copies now agree
#else
    // CPU-only build: data can never legally be HEAD_AT_GPU.
    NO_GPU;
#endif
    break;
  case HEAD_AT_CPU:
  case SYNCED:
    break;
  }
}
// 内部使用的 // 如果当前未初始化直接在GPU分配内存 // 如果当前在CPU,则在GPU上分配内存并且复制到GPU // 如果数据已经在GPU则啥也不做 inline void SyncedMemory::to_gpu() { #ifndef CPU_ONLY switch (head_) { case UNINITIALIZED: // 获取设备 CUDA_CHECK(cudaGetDevice(&gpu_device_)); // 在设备上分配内存 CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_)); // 初始化为0 caffe_gpu_memset(size_, 0, gpu_ptr_); head_ = HEAD_AT_GPU; own_gpu_data_ = true; break; case HEAD_AT_CPU: if (gpu_ptr_ == NULL) { CUDA_CHECK(cudaGetDevice(&gpu_device_)); CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_)); own_gpu_data_ = true; } caffe_gpu_memcpy(size_, cpu_ptr_, gpu_ptr_); head_ = SYNCED; break; case HEAD_AT_GPU: case SYNCED: break; } #else NO_GPU; #endif }
// 内部使用的 // 如果当前未初始化,直接在内存分配空间 // 如果在GPU上则复制到内存 // 如果已经在内存则啥都不动 inline void SyncedMemory::to_cpu() { switch (head_) { // 如果当前是未初始化,直接分配CPU上的内存 case UNINITIALIZED: CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_); caffe_memset(size_, 0, cpu_ptr_); head_ = HEAD_AT_CPU; own_cpu_data_ = true; break; case HEAD_AT_GPU: #ifndef CPU_ONLY // 如果当前数据在GPU,然后cpu_ptr为空 if (cpu_ptr_ == NULL) { // 分配内存 CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_); own_cpu_data_ = true; } // 复制数据 caffe_gpu_memcpy(size_, gpu_ptr_, cpu_ptr_); head_ = SYNCED; #else// CPU_ONLY模式当然只能报错了 NO_GPU; #endif break; case HEAD_AT_CPU: case SYNCED: break; } }
// Returns the index of the first NaN element of the n-element GPU array M,
// or -1 if no element is NaN.  The data is copied to the host for inspection.
// Fixed: an explicit specialization must be introduced with `template <>`;
// also collapsed the duplicated delete[] into a single cleanup path.
template <>
int check_nan_error<double>(const int n, const double* M) {
  double* temp = new double[n];
  caffe_gpu_memcpy(n * sizeof(double), M, temp);
  int first_nan = -1;
  for (int i = 0; i < n; ++i) {
    // NaN is the only value that compares unequal to itself.
    if (temp[i] != temp[i]) {
      first_nan = i;
      break;
    }
  }
  delete[] temp;
  return first_nan;
}
// Returns the index of the first NaN element of the n-element GPU array M,
// or -1 if no element is NaN.  The data is copied to the host for inspection.
// Fixed: an explicit specialization must be introduced with `template <>`;
// also collapsed the duplicated delete[] into a single cleanup path.
template <>
int check_nan_error<float>(const int n, const float* M) {
  float* temp = new float[n];
  caffe_gpu_memcpy(n * sizeof(float), M, temp);
  int first_nan = -1;
  for (int i = 0; i < n; ++i) {
    // NaN is the only value that compares unequal to itself.
    if (temp[i] != temp[i]) {
      first_nan = i;
      break;
    }
  }
  delete[] temp;
  return first_nan;
}
// Logs the top-left row_end x col_end corner of a row-major row x col matrix
// that lives in GPU memory, one LOG(INFO) line per matrix row.
// Fixed: an explicit specialization must be introduced with `template <>`.
// NOTE(review): unlike the float overload, this specialization takes no
// row_start/col_start parameters; the signature is kept as-is so existing
// callers continue to compile.
template <>
void print_gpu_matrix<double>(const double* M, int row, int col,
                              int row_end, int col_end) {
  const int size = row * col;
  double* temp = new double[size];
  // Bring the whole matrix to the host so individual elements can be read.
  caffe_gpu_memcpy(size * sizeof(double), M, temp);
  string line;
  for (int i = 0; i < row_end; ++i) {
    line = "";
    for (int j = 0; j < col_end; ++j) {
      line += patch::to_string(temp[i * col + j]) + " ";
    }
    LOG(INFO) << line;
  }
  delete[] temp;
}
// Logs the sub-block [row_start, row_end) x [col_start, col_end) of a
// row-major row x col matrix that lives in GPU memory, one LOG(INFO) line
// per matrix row.
// Fixed: an explicit specialization must be introduced with `template <>`.
template <>
void print_gpu_matrix<float>(const float* M, int row, int col,
                             int row_start, int row_end,
                             int col_start, int col_end) {
  const int size = row * col;
  float* temp = new float[size];
  // Bring the whole matrix to the host so individual elements can be read.
  caffe_gpu_memcpy(size * sizeof(float), M, temp);
  string line;
  for (int i = row_start; i < row_end; ++i) {
    line = "";
    for (int j = col_start; j < col_end; ++j) {
      line += patch::to_string(temp[i * col + j]) + " ";
    }
    LOG(INFO) << line;
  }
  delete[] temp;
}
// Writes an R x C row-major matrix from GPU memory to a text file as
// comma-separated columns with CRLF ("\r\n") line endings.
// Fixed: an explicit specialization must be introduced with `template <>`;
// also guarded against C == 0, where the original last-column write
// temp[i * C + C - 1] indexed temp[-1] (undefined behavior).
template <>
void write_to_file<double>(string filename, const int R, const int C,
                           const double* A) {
  std::ofstream output_file(filename.c_str(), std::ios::out);
  // Degenerate shapes: nothing to write, and the code below would underflow.
  if (R <= 0 || C <= 0) {
    output_file.close();
    return;
  }
  double* temp = new double[R * C];
  caffe_gpu_memcpy(R * C * sizeof(double), A, temp);
  for (int i = 0; i < R; ++i) {
    // All but the last column get a trailing comma; the last ends the row.
    for (int j = 0; j < C - 1; ++j) {
      output_file << temp[i * C + j] << ",";
    }
    output_file << temp[i * C + C - 1] << "\r\n";
  }
  output_file.close();
  delete[] temp;
}
// Ensure the data is present and up to date on the host, copying back from
// either the CUDA or the OpenCL (Greentea/ViennaCL) device as selected by
// the device context's backend.
inline void SyncedMemory::to_cpu() {
  switch (head_) {
    case UNINITIALIZED: {
      // First touch: allocate and zero the host buffer.
      CaffeMallocHost(&cpu_ptr_, size_);
      caffe_memset(size_, 0, cpu_ptr_);
      head_ = HEAD_AT_CPU;
      own_cpu_data_ = true;  // this object now owns (and must free) cpu_ptr_
      break;
    }
    case HEAD_AT_GPU: {
#ifndef CPU_ONLY
      if (cpu_ptr_ == nullptr) {
        // Host buffer was never allocated (the data was born on the device).
        CaffeMallocHost(&cpu_ptr_, size_);
        own_cpu_data_ = true;
      }
      if (device_context_->backend() == Backend::BACKEND_CUDA) {
#ifdef USE_CUDA
        // CUDA backend: plain device -> host copy.
        caffe_gpu_memcpy(size_, gpu_ptr_, cpu_ptr_);
#endif  // USE_CUDA
      } else {
#ifdef USE_GREENTEA
        // OpenCL backend: gpu_ptr_ actually holds a cl_mem handle.
        viennacl::ocl::context ctx = viennacl::ocl::get_context(
            device_context_->id());
        greentea_gpu_memcpy(size_, (cl_mem) gpu_ptr_, 0, cpu_ptr_, &ctx);
        // Block until the transfer has completed before the host reads it.
        ctx.get_queue().finish();
#endif
      }
      head_ = SYNCED;  // host and device copies now agree
#else
      NO_GPU;
#endif  // !CPU_ONLY
      break;
    }
    case HEAD_AT_CPU:
    case SYNCED:
      break;
  }
}
// Ensure the data is present and up to date on the GPU.
// NOTE(review): unlike the to_cpu() counterparts, no ownership flag is set
// here after allocating gpu_ptr_ — presumably this class version frees
// gpu_ptr_ unconditionally; verify against the destructor.
inline void SyncedMemory::to_gpu() {
#ifndef CPU_ONLY
  switch (head_) {
  case UNINITIALIZED:
    // First touch: allocate device memory and zero it.
    CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
    CUDA_CHECK(cudaMemset(gpu_ptr_, 0, size_));
    head_ = HEAD_AT_GPU;
    break;
  case HEAD_AT_CPU:
    if (gpu_ptr_ == NULL) {
      // Device buffer was never allocated; allocate it lazily.
      CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
    }
    // Copy the data host -> device.
    caffe_gpu_memcpy(size_, cpu_ptr_, gpu_ptr_);
    head_ = SYNCED;  // host and device copies now agree
    break;
  case HEAD_AT_GPU:
  case SYNCED:
    break;
  }
#else
  // CPU-only build: reaching here is an error.
  NO_GPU;
#endif
}
// Ensure the data is present and up to date on the device, for either the
// CUDA or the OpenCL (Greentea/ViennaCL) backend as selected by the device
// context.  For OpenCL, gpu_ptr_ stores the cl_mem handle reinterpreted as
// a void*.
inline void SyncedMemory::to_gpu() {
#ifndef CPU_ONLY
  switch (head_) {
    case UNINITIALIZED: {
      // First touch on the device: allocate and zero-fill.
      if (device_context_->backend() == Backend::BACKEND_CUDA) {
#ifdef USE_CUDA
        CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
        device_context_->IncreaseMemoryUsage(size_);
        caffe_gpu_memset(size_, 0, gpu_ptr_);
#endif  // USE_CUDA
      } else {
#ifdef USE_GREENTEA
        viennacl::ocl::context ctx = viennacl::ocl::get_context(
            device_context_->id());
        // Drain the queue before allocating on this context.
        ctx.get_queue().finish();
        cl_int err;
        if (ctx.devices()[0].type() == CL_DEVICE_TYPE_CPU) {
          // On a CPU OpenCL device, request a host-accessible allocation.
          cl_gpu_mem_ = clCreateBuffer(ctx.handle().get(),
                                       CL_MEM_READ_WRITE
                                           | CL_MEM_ALLOC_HOST_PTR,
                                       size_, nullptr, &err);
        } else {
          cl_gpu_mem_ = clCreateBuffer(ctx.handle().get(), CL_MEM_READ_WRITE,
                                       size_, nullptr, &err);
        }
        CHECK_EQ(0, err) << "OpenCL buffer allocation of size "
                         << size_ << " failed.";
        device_context_->IncreaseMemoryUsage(size_);
        int alpha = 0;
        // Zero-fill the new buffer.
        greentea_memset(device_context_->id(), size_, alpha, cl_gpu_mem_, 0);
        gpu_ptr_ = reinterpret_cast<void*>(cl_gpu_mem_);
        ctx.get_queue().finish();
#endif  // USE_GREENTEA
      }
      head_ = HEAD_AT_GPU;
      break;
    }
    case HEAD_AT_CPU: {
      // Host holds the newest data: allocate the device buffer if needed,
      // then copy host -> device.
      if (device_context_->backend() == Backend::BACKEND_CUDA) {
#ifdef USE_CUDA
        if (gpu_ptr_ == nullptr) {
          CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
          device_context_->IncreaseMemoryUsage(size_);
        }
        caffe_gpu_memcpy(size_, cpu_ptr_, gpu_ptr_);
#endif  // USE_CUDA
      } else {
#ifdef USE_GREENTEA
        viennacl::ocl::context ctx = viennacl::ocl::get_context(
            device_context_->id());
        ctx.get_queue().finish();
        if (gpu_ptr_ == nullptr) {
          cl_int err;
          if (ctx.devices()[0].type() == CL_DEVICE_TYPE_CPU) {
            // On a CPU OpenCL device, request a host-accessible allocation.
            cl_gpu_mem_ = clCreateBuffer(
                ctx.handle().get(),
                CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                size_, nullptr, &err);
          } else {
            cl_gpu_mem_ = clCreateBuffer(ctx.handle().get(),
                                         CL_MEM_READ_WRITE,
                                         size_, nullptr, &err);
          }
          CHECK_EQ(0, err) << "OpenCL buffer allocation of size "
                           << size_ << " failed.";
          device_context_->IncreaseMemoryUsage(size_);
          gpu_ptr_ = reinterpret_cast<void*>(cl_gpu_mem_);
          ctx.get_queue().finish();
        }
        greentea_gpu_memcpy(size_, cpu_ptr_, (cl_mem) gpu_ptr_, 0, &ctx);
        // Block until the transfer has completed.
        ctx.get_queue().finish();
#endif  // USE_GREENTEA
      }
      head_ = SYNCED;  // host and device copies now agree
      break;
    }
    case HEAD_AT_GPU:
    case SYNCED:
      break;
  }
#else
  NO_GPU;
#endif
}