inline void SyncedMemory::to_cpu() { switch (head_) { case UNINITIALIZED: CaffeMallocHost(&cpu_ptr_, size_); memset(cpu_ptr_, 0, size_); head_ = HEAD_AT_CPU; own_cpu_data_ = true; break; case HEAD_AT_GPU: #ifndef CPU_ONLY if (cpu_ptr_ == NULL) { CaffeMallocHost(&cpu_ptr_, size_); own_cpu_data_ = true; } caffe_gpu_memcpy(size_, gpu_ptr_, cpu_ptr_); head_ = SYNCED; #else NO_GPU; #endif break; case HEAD_AT_CPU: case SYNCED: break; } }
// 内部使用的 // 如果当前未初始化,直接在内存分配空间 // 如果在GPU上则复制到内存 // 如果已经在内存则啥都不动 inline void SyncedMemory::to_cpu() { switch (head_) { // 如果当前是未初始化,直接分配CPU上的内存 case UNINITIALIZED: CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_); caffe_memset(size_, 0, cpu_ptr_); head_ = HEAD_AT_CPU; own_cpu_data_ = true; break; case HEAD_AT_GPU: #ifndef CPU_ONLY // 如果当前数据在GPU,然后cpu_ptr为空 if (cpu_ptr_ == NULL) { // 分配内存 CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_); own_cpu_data_ = true; } // 复制数据 caffe_gpu_memcpy(size_, gpu_ptr_, cpu_ptr_); head_ = SYNCED; #else// CPU_ONLY模式当然只能报错了 NO_GPU; #endif break; case HEAD_AT_CPU: case SYNCED: break; } }
// Bring the data to the host: allocate/zero on first use, or copy the
// device buffer down when the freshest copy currently lives on the GPU.
inline void SyncedMemory::to_cpu() {
  switch (head_) {
  case UNINITIALIZED:
    // First access: create the host buffer and zero it.
    CaffeMallocHost(&cpu_ptr_, size_);
    memset(cpu_ptr_, 0, size_);
    head_ = HEAD_AT_CPU;
    break;
  case HEAD_AT_GPU:
    // Allocate a host buffer on demand, then copy device -> host.
    if (cpu_ptr_ == nullptr) {
      CaffeMallocHost(&cpu_ptr_, size_);
    }
    CUDA_CHECK(cudaMemcpy(cpu_ptr_, gpu_ptr_, size_,
                          cudaMemcpyDeviceToHost));
    head_ = SYNCED;
    break;
  case HEAD_AT_CPU:
  case SYNCED:
    // Host copy is already up to date.
    break;
  }
}
inline void SyncedMemory::to_cpu() { switch (head_) { case UNINITIALIZED: CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_); caffe_memset(size_, 0, cpu_ptr_); head_ = HEAD_AT_CPU; own_cpu_data_ = true; break; case HEAD_AT_CPU: break; } }
// Ensure a valid CPU copy of the data exists. Supports two device
// backends (CUDA and greentea/OpenCL), selected at runtime via the
// device context and compiled in via USE_CUDA / USE_GREENTEA.
inline void SyncedMemory::to_cpu() {
  switch (head_) {
    // First access: allocate the host buffer and zero-fill it.
    case UNINITIALIZED: {
      CaffeMallocHost(&cpu_ptr_, size_);
      caffe_memset(size_, 0, cpu_ptr_);
      head_ = HEAD_AT_CPU;
      own_cpu_data_ = true;
      break;
    }
    case HEAD_AT_GPU: {
#ifndef CPU_ONLY
      // Lazily allocate the host buffer before the device -> host copy.
      if (cpu_ptr_ == nullptr) {
        CaffeMallocHost(&cpu_ptr_, size_);
        own_cpu_data_ = true;
      }
      // Dispatch the copy on the device backend.
      if (device_context_->backend() == Backend::BACKEND_CUDA) {
#ifdef USE_CUDA
        caffe_gpu_memcpy(size_, gpu_ptr_, cpu_ptr_);
#endif  // USE_CUDA
      } else {
#ifdef USE_GREENTEA
        // NOTE(review): this copies the ViennaCL context object by value;
        // upstream OpenCL Caffe binds it by reference
        // (viennacl::ocl::context &ctx) -- confirm a by-value copy is
        // intended here.
        viennacl::ocl::context ctx = viennacl::ocl::get_context(
            device_context_->id());
        greentea_gpu_memcpy(size_, (cl_mem) gpu_ptr_, 0, cpu_ptr_, &ctx);
        // Block until the queued copy finishes before exposing cpu_ptr_.
        ctx.get_queue().finish();
#endif
      }
      head_ = SYNCED;
#else
      // GPU data cannot exist in a CPU-only build.
      NO_GPU;
#endif  // !CPU_ONLY
      break;
    }
    // Host copy already current: nothing to do.
    case HEAD_AT_CPU:
    case SYNCED:
      break;
  }
}
// Snapshot this blob's parameters out of the Petuum parameter-server table
// at the given clock.
//
// Each dense table row is fetched into a local cache and the caches are then
// flattened into a freshly allocated host buffer of capacity_ elements, of
// which the first count_ are filled.
//
// Ownership: returns a raw buffer allocated with CaffeMallocHost; the CALLER
// is responsible for releasing it (with the matching CaffeFreeHost).
//
// Cleanups vs. the previous version: the trailing "release memory" swap
// loops were dead code (the local vectors are destroyed at scope exit
// anyway), util::Context::num_rows_per_table() was re-evaluated in every
// loop header, and the early-exit condition was duplicated inside and after
// the inner loop; it now lives in the loop guards.
Dtype* Blob<Dtype>::ReadPSTable(const int clock) const {
  CHECK(global_table_ptr_);
  // Loop-invariant table geometry; presumably a static configuration value
  // that does not change during this call -- TODO confirm.
  const int num_rows = util::Context::num_rows_per_table();
  void* data_temp;
  CaffeMallocHost(&data_temp, capacity_ * sizeof(Dtype));
  Dtype* data = (Dtype*)data_temp;
  // Fetch every dense row at the requested clock into a local cache.
  vector<vector<Dtype> > row_caches(num_rows);
  for (int r_idx = 0; r_idx < num_rows; ++r_idx) {
    row_caches[r_idx].resize(global_table_row_capacity_);
    petuum::RowAccessor row_acc;
    const auto& r = global_table_ptr_->template Get<petuum::DenseRow<Dtype> >(
        r_idx, &row_acc, clock);
    r.CopyToVector(&row_caches[r_idx]);
  }
  // Flatten the row caches into the output buffer. The table may hold more
  // slots than count_, so stop as soon as the blob is full.
  int data_idx = 0;
  for (int r_idx = 0; r_idx < num_rows && data_idx < count_; ++r_idx) {
    for (int i = 0; i < global_table_row_capacity_ && data_idx < count_; ++i) {
      data[data_idx] = row_caches[r_idx][i];
      ++data_idx;
    }
  }
  // row_caches is released automatically when it goes out of scope.
  return data;
}