void InternalThread::StartInternalThread() {
  // TODO switch to failing once Caffe prefetch thread is persistent.
  // Threads should not be started and stopped repeatedly.
  // CHECK(!is_started());
  StopInternalThread();

#ifndef CPU_ONLY
  CUDA_CHECK(cudaGetDevice(&device_));
#endif

  mode_ = Caffe::mode();
  rand_seed_ = caffe_rng_rand();
  solver_count_ = Caffe::solver_count();
  root_solver_ = Caffe::root_solver();

  try {
    thread_.reset(new boost::thread(&InternalThread::entry, this));
  } catch (std::exception& e) {
    CHECK(false) << e.what();
  }
}
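// For reference: the runtime-API snippets in this collection wrap every CUDA
// call in CUDA_CHECK. A minimal sketch of such a macro, assuming a glog-style
// CHECK_EQ as used by Caffe (the exact upstream definition may differ):
#include <cuda_runtime.h>
#include <glog/logging.h>

#define CUDA_CHECK(condition)                                            \
  do {                                                                   \
    cudaError_t error = (condition);                                     \
    CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error);    \
  } while (0)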
void transform(Param<T> out, CParam<T> in, CParam<float> tf, const bool inverse)
{
    const dim_type nimages = in.dims[2];
    // Multiplied in src/backend/transform.cpp
    const dim_type ntransforms = out.dims[2] / in.dims[2];

    // Copy transform to constant memory.
    CUDA_CHECK(cudaMemcpyToSymbol(c_tmat, tf.ptr, ntransforms * 6 * sizeof(float),
                                  0, cudaMemcpyDeviceToDevice));

    dim3 threads(TX, TY, 1);
    dim3 blocks(divup(out.dims[0], threads.x), divup(out.dims[1], threads.y));

    if (nimages > 1)     { blocks.x *= nimages; }
    if (ntransforms > 1) { blocks.y *= ntransforms; }

    if (inverse) {
        transform_kernel<T, true ><<<blocks, threads>>>(out, in, nimages, ntransforms);
    } else {
        transform_kernel<T, false><<<blocks, threads>>>(out, in, nimages, ntransforms);
    }
}
float Timer::MilliSeconds() {
  if (!has_run_at_least_once()) {
    LOG(WARNING) << "Timer has never been run before reading time.";
    return 0;
  }
  if (running()) {
    Stop();
  }
  if (Caffe::mode() == Caffe::GPU) {
#ifndef CPU_ONLY
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_,
                                    stop_gpu_));
#else
    NO_GPU;
#endif
  } else {
    elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds();
  }
  return elapsed_milliseconds_;
}
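// cudaEventElapsedTime above only yields a valid result once both events have
// been recorded and the stop event has completed. A minimal sketch of the
// recording side of such a timer, assuming the same start_gpu_ / stop_gpu_
// cudaEvent_t members (the real Timer also handles the CPU path and the
// running/has-run bookkeeping used above):
void Timer::Start() {
  CUDA_CHECK(cudaEventRecord(start_gpu_, 0));   // mark the start on the default stream
}

void Timer::Stop() {
  CUDA_CHECK(cudaEventRecord(stop_gpu_, 0));    // mark the stop on the default stream
  CUDA_CHECK(cudaEventSynchronize(stop_gpu_));  // wait so the elapsed time can be read
}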
void CuDNNConvolutionLayer<Dtype>::LayerSetUp( const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { ConvolutionLayer<Dtype>::LayerSetUp(bottom, top); // Initialize CUDA streams and cuDNN. stream_ = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP]; handle_ = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP]; for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) { CUDA_CHECK(cudaStreamCreate(&stream_[g])); CUDNN_CHECK(cudnnCreate(&handle_[g])); CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g])); } // Set the indexing parameters. weight_offset_ = (this->num_output_ / this->group_) * (this->channels_ / this->group_) * this->kernel_h_ * this->kernel_w_; bias_offset_ = (this->num_output_ / this->group_); // Create filter descriptor. cudnn::createFilterDesc<Dtype>(&filter_desc_, this->num_output_ / this->group_, this->channels_ / this->group_, this->kernel_h_, this->kernel_w_); // Create tensor descriptor(s) for data and corresponding convolution(s). for (int i = 0; i < bottom.size(); i++) { cudnnTensor4dDescriptor_t bottom_desc; cudnn::createTensor4dDesc<Dtype>(&bottom_desc); bottom_descs_.push_back(bottom_desc); cudnnTensor4dDescriptor_t top_desc; cudnn::createTensor4dDesc<Dtype>(&top_desc); top_descs_.push_back(top_desc); cudnnConvolutionDescriptor_t conv_desc; cudnn::createConvolutionDesc<Dtype>(&conv_desc); conv_descs_.push_back(conv_desc); } // Tensor descriptor for bias. if (this->bias_term_) { cudnn::createTensor4dDesc<Dtype>(&bias_desc_); } }
Array<T> morph(const Array<T> &in, const Array<T> &mask) {
    const dim4 mdims = mask.dims();

    if (mdims[0] != mdims[1])
        CUDA_NOT_SUPPORTED("Rectangular masks are not supported");
    if (mdims[0] > 19) CUDA_NOT_SUPPORTED("Kernels > 19x19 are not supported");

    Array<T> out = createEmptyArray<T>(in.dims());

    CUDA_CHECK(cudaMemcpyToSymbolAsync(
        kernel::cFilter, mask.get(), mdims[0] * mdims[1] * sizeof(T), 0,
        cudaMemcpyDeviceToDevice, cuda::getActiveStream()));

    if (isDilation)
        kernel::morph<T, true>(out, in, mdims[0]);
    else
        kernel::morph<T, false>(out, in, mdims[0]);

    return out;
}
void caffe_copy(const int N, const Dtype* X, Dtype* Y) {
  if (X != Y) {
    // If more than one OpenMP thread is running (i.e. we are inside an active
    // parallel region), checking Caffe::mode() could create an additional GPU
    // context.
    if (
#ifdef _OPENMP
        (omp_in_parallel() == 0) &&
#endif
        (Caffe::mode() == Caffe::GPU)) {
#ifndef CPU_ONLY
      // NOLINT_NEXT_LINE(caffe/alt_fn)
      CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault));
#else
      NO_GPU;
#endif
    } else {
      caffe_cpu_copy<Dtype>(N, X, Y);
    }
  }
}
// Bring the data to the CPU.
inline void SyncedMemory::to_cpu() {
  switch (head_) {
  case UNINITIALIZED:
    CaffeMallocHost(&cpu_ptr_, size_);
    memset(cpu_ptr_, 0, size_);
    head_ = HEAD_AT_CPU;
    own_cpu_data_ = true;
    break;
  case HEAD_AT_GPU:
    if (cpu_ptr_ == NULL) {
      CaffeMallocHost(&cpu_ptr_, size_);
      own_cpu_data_ = true;
    }
    CUDA_CHECK(cudaMemcpy(cpu_ptr_, gpu_ptr_, size_, cudaMemcpyDeviceToHost));
    head_ = SYNCED;
    break;
  case HEAD_AT_CPU:
  case SYNCED:
    break;
  }
}
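// The mirror image of to_cpu() moves data the other way. A sketch of what the
// corresponding to_gpu() looks like (not necessarily the exact upstream code;
// it assumes the same head_ / cpu_ptr_ / gpu_ptr_ / size_ members):
inline void SyncedMemory::to_gpu() {
#ifndef CPU_ONLY
  switch (head_) {
  case UNINITIALIZED:
    CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
    CUDA_CHECK(cudaMemset(gpu_ptr_, 0, size_));
    head_ = HEAD_AT_GPU;
    break;
  case HEAD_AT_CPU:
    if (gpu_ptr_ == NULL) {
      CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
    }
    CUDA_CHECK(cudaMemcpy(gpu_ptr_, cpu_ptr_, size_, cudaMemcpyHostToDevice));
    head_ = SYNCED;
    break;
  case HEAD_AT_GPU:
  case SYNCED:
    break;
  }
#else
  NO_GPU;
#endif
}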
/*
// Launch GPU kernel of normalize
//
// API
// int normalizeGPULaunch(const float alfa, CvLSVMFeatureMapGPU *dev_map_in,
//                        CvLSVMFeatureMapGPU *dev_norm,
//                        CvLSVMFeatureMapGPU *dev_map_out, CUstream stream);
// INPUT
// alfa
// dev_map_in
// dev_norm
// stream
// OUTPUT
// dev_map_out
// RESULT
// Error status
*/
int normalizeGPULaunch(const float alfa, CvLSVMFeatureMapGPU *dev_map_in,
        CvLSVMFeatureMapGPU *dev_norm, CvLSVMFeatureMapGPU *dev_map_out,
        CUstream stream)
{
    int sizeX, sizeY;
    int thread_num_x, thread_num_y, thread_num_z;
    int block_num_x, block_num_y, block_num_z;
    int sharedMemBytes;
    CUresult res;

    sizeX = dev_map_in->sizeX;
    sizeY = dev_map_in->sizeY;

    void *normalize_kernel_arg[] =
    {
        (void *) &dev_map_in->map,
        (void *) &dev_norm->map,
        (void *) &dev_map_out->map,
        (void *) &sizeX,
        (void *) &sizeY,
        (void *) &alfa,
    };

    thread_num_x = (sizeX < std::sqrt(max_threads_num)) ?
            sizeX : std::sqrt(max_threads_num);
    thread_num_y = (sizeY < std::sqrt(max_threads_num)) ?
            sizeY : std::sqrt(max_threads_num);
    thread_num_z = 1;
    block_num_x = sizeX / thread_num_x;
    block_num_y = sizeY / thread_num_y;
    block_num_z = NUM_SECTOR * 2;
    if (sizeX % thread_num_x != 0) block_num_x++;
    if (sizeY % thread_num_y != 0) block_num_y++;

    sharedMemBytes = 0;

    res = cuLaunchKernel(normalizeAndTruncate_func[0],
            block_num_x, block_num_y, block_num_z,
            thread_num_x, thread_num_y, thread_num_z,
            sharedMemBytes, stream, normalize_kernel_arg, NULL);
    CUDA_CHECK(res, "cuLaunchKernel(normalizeAndTruncate)");

    return LATENT_SVM_OK;
}
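// Unlike the runtime-API macro, the driver-API wrappers in this collection use
// a two-argument CUDA_CHECK(res, "label"). A minimal sketch of such a helper
// (an assumption, not the project's exact macro), turning a CUresult into a
// readable message with cuGetErrorString:
#include <cstdio>
#include <cuda.h>

#define CUDA_CHECK(res, label)                                         \
  do {                                                                 \
    if ((res) != CUDA_SUCCESS) {                                       \
      const char *err_str = NULL;                                      \
      cuGetErrorString((res), &err_str);                               \
      fprintf(stderr, "%s failed: %s\n", (label),                      \
              err_str ? err_str : "unknown CUDA error");               \
    }                                                                  \
  } while (0)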
/*
// Launch GPU kernel of PCA feature maps
//
// API
// int PCAFeatureMapsAddNullableBorderGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
//                        CvLSVMFeatureMapGPU *dev_map_out, const int bx,
//                        const int by, CUstream stream);
// INPUT
// dev_map_in
// bx
// by
// stream
// OUTPUT
// dev_map_out
// RESULT
// Error status
*/
int PCAFeatureMapsAddNullableBorderGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
        CvLSVMFeatureMapGPU *dev_map_out, const int bx, const int by,
        CUstream stream)
{
    int sizeX, sizeY, p;
    int thread_num_x, thread_num_y, thread_num_z;
    int block_num_x, block_num_y, block_num_z;
    int sharedMemBytes;
    CUresult res;

    sizeX = dev_map_in->sizeX;
    sizeY = dev_map_in->sizeY;
    p = dev_map_in->numFeatures;

    void *pca_kernel_arg[] =
    {
        (void *) &dev_map_in->map,
        (void *) &dev_map_out->map,
        (void *) &sizeX,
        (void *) &sizeY,
        (void *) &p,
        (void *) &bx,
        (void *) &by
    };

    thread_num_x = (sizeX < std::sqrt(max_threads_num)) ?
            sizeX : std::sqrt(max_threads_num);
    thread_num_y = (sizeY < std::sqrt(max_threads_num)) ?
            sizeY : std::sqrt(max_threads_num);
    thread_num_z = 1;
    block_num_x = sizeX / thread_num_x;
    block_num_y = sizeY / thread_num_y;
    block_num_z = 1;
    if (sizeX % thread_num_x != 0) block_num_x++;
    if (sizeY % thread_num_y != 0) block_num_y++;

    sharedMemBytes = 0;

    res = cuLaunchKernel(PCAFeatureMapsAddNullableBorder_func[0],
            block_num_x, block_num_y, block_num_z,
            thread_num_x, thread_num_y, thread_num_z,
            sharedMemBytes, stream, pca_kernel_arg, NULL);
    CUDA_CHECK(res, "cuLaunchKernel(PCAFeatureMaps)");

    return LATENT_SVM_OK;
}
inline void SyncedMemory::to_cpu() {
  switch (head_) {
  case UNINITIALIZED:
    CaffeMallocHost(&cpu_ptr_, size_);
    CHECK(cpu_ptr_ != 0) << "size " << size_;
    memset(cpu_ptr_, 0, size_);
    head_ = HEAD_AT_CPU;
    break;
#if 0
  case HEAD_AT_GPU:
    if (cpu_ptr_ == NULL) {
      CaffeMallocHost(&cpu_ptr_, size_);
    }
    CUDA_CHECK(cudaMemcpy(cpu_ptr_, gpu_ptr_, size_, cudaMemcpyDeviceToHost));
    head_ = SYNCED;
    break;
#endif
  case HEAD_AT_CPU:
  case SYNCED:
    break;
  }
}
T* pinnedAlloc(const size_t &elements)
{
    managerInit();
    T* ptr = NULL;
    // Round the allocation up to the next megabyte: creating pinned memory is
    // expensive, so we prefer larger, more reusable blocks.
    size_t alloc_bytes = divup(sizeof(T) * elements, 1048576) * 1048576;

    if (elements > 0) {

        // FIXME: Add better checks for garbage collection
        // Perhaps look at total memory available as a metric
        if (pinned_maps.size() >= MAX_BUFFERS || pinned_used_bytes >= MAX_BYTES) {
            pinnedGarbageCollect();
        }

        for (mem_iter iter = pinned_maps.begin();
             iter != pinned_maps.end(); ++iter) {

            mem_info info = iter->second;
            if (info.is_free && info.bytes == alloc_bytes) {
                iter->second.is_free = false;
                pinned_used_bytes += alloc_bytes;
                return (T *)iter->first;
            }
        }

        // Perform garbage collection if memory can not be allocated
        if (cudaMallocHost((void **)&ptr, alloc_bytes) != cudaSuccess) {
            pinnedGarbageCollect();
            CUDA_CHECK(cudaMallocHost((void **)(&ptr), alloc_bytes));
        }

        mem_info info = {false, false, alloc_bytes};
        pinned_maps[ptr] = info;
        pinned_used_bytes += alloc_bytes;
    }

    return (T*)ptr;
}
float Timer::MicroSeconds() {
  if (!has_run_at_least_once()) {
    LOG(WARNING) << "Timer has never been run before reading time.";
    return 0;
  }
  if (running()) {
    Stop();
  }
#ifdef USE_CUDA
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_,
                                    stop_gpu_));
    // CUDA only measures milliseconds.
    elapsed_microseconds_ = elapsed_milliseconds_ * 1000;
  } else {
#endif
    elapsed_microseconds_ = (stop_cpu_ - start_cpu_).total_microseconds();
#ifdef USE_CUDA
  }
#endif
  return elapsed_microseconds_;
}
/*
// Launch GPU kernel of calculate norm
//
// API
// int calculateNormGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
//                        CvLSVMFeatureMapGPU *dev_norm, CUstream stream)
// INPUT
// dev_map_in
// stream
// OUTPUT
// dev_norm
// RESULT
// Error status
*/
int calculateNormGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
        CvLSVMFeatureMapGPU *dev_norm, CUstream stream)
{
    int sizeX, sizeY, xp;
    int thread_num_x, thread_num_y, thread_num_z;
    int block_num_x, block_num_y, block_num_z;
    int sharedMemBytes;
    CUresult res;

    sizeX = dev_map_in->sizeX;
    sizeY = dev_map_in->sizeY;
    xp = dev_map_in->numFeatures;

    void *calc_norm_kernel_arg[] =
    {
        (void *) &dev_map_in->map,
        (void *) &dev_norm->map,
        (void *) &sizeX,
        (void *) &sizeY,
        (void *) &xp,
    };

    thread_num_x = (sizeX < std::sqrt(max_threads_num)) ?
            sizeX : std::sqrt(max_threads_num);
    thread_num_y = (sizeY < std::sqrt(max_threads_num)) ?
            sizeY : std::sqrt(max_threads_num);
    thread_num_z = 1;
    block_num_x = sizeX / thread_num_x;
    block_num_y = sizeY / thread_num_y;
    block_num_z = 1;
    if (sizeX % thread_num_x != 0) block_num_x++;
    if (sizeY % thread_num_y != 0) block_num_y++;

    sharedMemBytes = 0;

    res = cuLaunchKernel(calculateNorm_func[0],
            block_num_x, block_num_y, block_num_z,
            thread_num_x, thread_num_y, thread_num_z,
            sharedMemBytes, stream, calc_norm_kernel_arg, NULL);
    CUDA_CHECK(res, "cuLaunchKernel(calculateNorm)");

    return LATENT_SVM_OK;
}
void MPIComm::ThreadFunc(int device) {
#ifndef CPU_ONLY
  // LOG(ERROR) << "device_id is " << device;
  CUDA_CHECK(cudaSetDevice(device));
#endif
  started_.store(true);

  MPIJob job;
  while (true) {
    mutex::scoped_lock lock(queue_mutex_);
    while (task_queue_.empty() && IsRunning()) {
      DLOG(INFO) << "no job running, waiting on cond";
      cond_work_.wait(lock);
    }
    lock.unlock();

    DLOG(INFO) << "Cond fulfilled, dispatching job";
    if (IsRunning()) {
      job = task_queue_.front();
      DLOG(INFO) << task_queue_.size();
      DispatchJob(job);

      mutex::scoped_lock pop_lock(queue_mutex_);
      task_queue_.pop();
      pop_lock.unlock();

      cond_finish_.notify_one();
      DLOG(INFO) << "job finished, popped task queue";
    } else {
      break;
    }
  }

  // finish remaining jobs
  while (!task_queue_.empty()) {
    boost::lock_guard<mutex> lock(queue_mutex_);
    job = task_queue_.front();
    task_queue_.pop();
    DispatchJob(job);
  }
}
Array<T> morph(const Array<T> &in, const Array<T> &mask)
{
    const dim4 mdims = mask.dims();

    if (mdims[0] != mdims[1])
        AF_ERROR("Only square masks are supported in CUDA morph currently",
                 AF_ERR_SIZE);
    if (mdims[0] > 19)
        AF_ERROR("Only square kernels up to 19x19 are currently supported in CUDA",
                 AF_ERR_SIZE);

    Array<T> out = createEmptyArray<T>(in.dims());

    CUDA_CHECK(cudaMemcpyToSymbolAsync(kernel::cFilter, mask.get(),
                                       mdims[0] * mdims[1] * sizeof(T), 0,
                                       cudaMemcpyDeviceToDevice,
                                       cuda::getStream(cuda::getActiveDeviceId())));

    if (isDilation)
        kernel::morph<T, true >(out, in, mdims[0]);
    else
        kernel::morph<T, false>(out, in, mdims[0]);

    return out;
}
T* memAlloc(const size_t &elements)
{
    int n = getActiveDeviceId();
    T* ptr = NULL;
    size_t alloc_bytes = divup(sizeof(T) * elements, 1024) * 1024;

    if (elements > 0) {

        // FIXME: Add better checks for garbage collection
        // Perhaps look at total memory available as a metric
        if (memory_maps[n].size() >= MAX_BUFFERS || used_bytes >= MAX_BYTES) {
            garbageCollect();
        }

        for (mem_iter iter = memory_maps[n].begin();
             iter != memory_maps[n].end(); iter++) {

            mem_info info = iter->second;
            if (info.is_free && info.bytes == alloc_bytes) {
                iter->second.is_free = false;
                used_bytes += alloc_bytes;
                return (T *)iter->first;
            }
        }

        // Perform garbage collection if memory can not be allocated
        if (cudaMalloc((void **)&ptr, alloc_bytes) != cudaSuccess) {
            garbageCollect();
            CUDA_CHECK(cudaMalloc((void **)(&ptr), alloc_bytes));
        }

        mem_info info = {false, alloc_bytes};
        memory_maps[n][ptr] = info;
        used_bytes += alloc_bytes;
    }

    return ptr;
}
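// A plausible counterpart to memAlloc above (a sketch that assumes the same
// memory_maps / mem_iter / used_bytes globals, not necessarily the library's
// actual memFree): instead of calling cudaFree, the block is marked free so a
// later memAlloc of the same size can reuse it.
template<typename T>
void memFree(T *ptr)
{
    int n = getActiveDeviceId();
    mem_iter iter = memory_maps[n].find((void *)ptr);

    if (iter != memory_maps[n].end()) {
        iter->second.is_free = true;        // keep the buffer around for reuse
        used_bytes -= iter->second.bytes;   // it no longer counts as in use
    } else {
        // Pointer did not come from this allocator; release it directly.
        CUDA_CHECK(cudaFree((void *)ptr));
    }
}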
SocketBuffer* SocketBuffer::Read(bool data) {
  // Pop the message from local queue
  QueuedMessage* qm = NULL;
  if (data) {
    qm = reinterpret_cast<QueuedMessage*>(
        this->channel_->receive_queue.pop());
#ifndef CPU_ONLY
    // Copy the received buffer to GPU memory
    CUDA_CHECK(cudaMemcpy(this->addr(), qm->buffer,  // NOLINT(caffe/alt_fn)
        qm->size, cudaMemcpyHostToDevice));  // NOLINT(caffe/alt_fn)
#else
    // caffe_copy(qm->size, qm->buffer, this->addr_);
    memcpy(this->addr_, qm->buffer, qm->size);
#endif
  } else {
    qm = reinterpret_cast<QueuedMessage*>(
        this->channel_->receive_queue_ctrl.pop());
  }
  // Free up the buffer and the wrapper object
  if (data)
    delete qm->buffer;
  delete qm;
  return this;
}
P2PSync<Dtype>::~P2PSync() {
#ifndef CPU_ONLY
  int initial_device;
  CUDA_CHECK(cudaGetDevice(&initial_device));
  const int self = solver_->param().device_id();
  CUDA_CHECK(cudaSetDevice(self));

  if (parent_) {
    CUDA_CHECK(cudaFree(parent_grads_));
    const int peer = parent_->solver_->param().device_id();
    int access;
    CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer));
    if (access) {
      CUDA_CHECK(cudaDeviceDisablePeerAccess(peer));
    }
  }

  CUDA_CHECK(cudaSetDevice(initial_device));
#endif
}
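// The destructor above tears down a peer mapping, so the setup side must have
// established it first. A minimal sketch of that step in a hypothetical helper
// (the function name and the self/peer parameters are illustrative, not the
// project's actual API):
void EnablePeerAccessIfPossible(int self, int peer) {
  CUDA_CHECK(cudaSetDevice(self));
  int access = 0;
  CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer));
  if (access) {
    // The flags argument of cudaDeviceEnablePeerAccess must currently be 0.
    CUDA_CHECK(cudaDeviceEnablePeerAccess(peer, 0));
  } else {
    LOG(INFO) << "GPU " << self << " cannot access GPU " << peer << " via P2P.";
  }
}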
void DevicePair::compute(const vector<int> devices, vector<DevicePair>* pairs) { #ifndef CPU_ONLY vector<int> remaining(devices); // Depth for reduction tree int remaining_depth = static_cast<int>(ceil(log2(remaining.size()))); // Group GPUs by board for (int d = 0; d < remaining_depth; ++d) { for (int i = 0; i < remaining.size(); ++i) { for (int j = i + 1; j < remaining.size(); ++j) { cudaDeviceProp a, b; CUDA_CHECK(cudaGetDeviceProperties(&a, remaining[i])); CUDA_CHECK(cudaGetDeviceProperties(&b, remaining[j])); if (a.isMultiGpuBoard && b.isMultiGpuBoard) { if (a.multiGpuBoardGroupID == b.multiGpuBoardGroupID) { pairs->push_back(DevicePair(remaining[i], remaining[j])); DLOG(INFO) << "GPU board: " << remaining[i] << ":" << remaining[j]; remaining.erase(remaining.begin() + j); break; } } } } } ostringstream s; for (int i = 0; i < remaining.size(); ++i) { s << (i ? ", " : "") << remaining[i]; } DLOG(INFO) << "GPUs paired by boards, remaining: " << s.str(); // Group by P2P accessibility remaining_depth = ceil(log2(remaining.size())); for (int d = 0; d < remaining_depth; ++d) { for (int i = 0; i < remaining.size(); ++i) { for (int j = i + 1; j < remaining.size(); ++j) { int access; CUDA_CHECK( cudaDeviceCanAccessPeer(&access, remaining[i], remaining[j])); if (access) { pairs->push_back(DevicePair(remaining[i], remaining[j])); DLOG(INFO) << "P2P pair: " << remaining[i] << ":" << remaining[j]; remaining.erase(remaining.begin() + j); break; } } } } s.str(""); for (int i = 0; i < remaining.size(); ++i) { s << (i ? ", " : "") << remaining[i]; } DLOG(INFO) << "GPUs paired by P2P access, remaining: " << s.str(); // Group remaining remaining_depth = ceil(log2(remaining.size())); for (int d = 0; d < remaining_depth; ++d) { for (int i = 0; i < remaining.size(); ++i) { pairs->push_back(DevicePair(remaining[i], remaining[i + 1])); DLOG(INFO) << "Remaining pair: " << remaining[i] << ":" << remaining[i + 1]; remaining.erase(remaining.begin() + i + 1); } } // Should only be the parent node remaining CHECK_EQ(remaining.size(), 1); pairs->insert(pairs->begin(), DevicePair(-1, remaining[0])); CHECK(pairs->size() == devices.size()); for (int i = 0; i < pairs->size(); ++i) { CHECK((*pairs)[i].parent() != (*pairs)[i].device()); for (int j = i + 1; j < pairs->size(); ++j) { CHECK((*pairs)[i].device() != (*pairs)[j].device()); } } #else NO_GPU; #endif }
GPUParams<Dtype>::~GPUParams() {
#ifndef CPU_ONLY
  CUDA_CHECK(cudaFree(data_));
  CUDA_CHECK(cudaFree(diff_));
#endif
}
void orb(unsigned* out_feat, float** d_x, float** d_y, float** d_score, float** d_ori, float** d_size, unsigned** d_desc, std::vector<unsigned>& feat_pyr, std::vector<float*>& d_x_pyr, std::vector<float*>& d_y_pyr, std::vector<unsigned>& lvl_best, std::vector<float>& lvl_scl, std::vector<CParam<T> >& img_pyr, const float fast_thr, const unsigned max_feat, const float scl_fctr, const unsigned levels) { unsigned patch_size = REF_PAT_SIZE; unsigned max_levels = feat_pyr.size(); // In future implementations, the user will be capable of passing his // distribution instead of using the reference one //CUDA_CHECK(cudaMemcpyToSymbol(d_ref_pat, h_ref_pat, 256 * 4 * sizeof(int), 0, cudaMemcpyHostToDevice)); std::vector<float*> d_score_pyr(max_levels); std::vector<float*> d_ori_pyr(max_levels); std::vector<float*> d_size_pyr(max_levels); std::vector<unsigned*> d_desc_pyr(max_levels); std::vector<unsigned*> d_idx_pyr(max_levels); unsigned total_feat = 0; // Calculate a separable Gaussian kernel unsigned gauss_len = 9; convAccT* h_gauss = new convAccT[gauss_len]; gaussian1D(h_gauss, gauss_len, 2.f); Param<convAccT> gauss_filter; gauss_filter.dims[0] = gauss_len; gauss_filter.strides[0] = 1; for (int k = 1; k < 4; k++) { gauss_filter.dims[k] = 1; gauss_filter.strides[k] = gauss_filter.dims[k - 1] * gauss_filter.strides[k - 1]; } dim_type gauss_elem = gauss_filter.strides[3] * gauss_filter.dims[3]; gauss_filter.ptr = memAlloc<convAccT>(gauss_elem); CUDA_CHECK(cudaMemcpy(gauss_filter.ptr, h_gauss, gauss_elem * sizeof(convAccT), cudaMemcpyHostToDevice)); delete[] h_gauss; for (int i = 0; i < (int)max_levels; i++) { if (feat_pyr[i] == 0 || lvl_best[i] == 0) { if (i > 0) memFree((T*)img_pyr[i].ptr); continue; } unsigned* d_usable_feat = memAlloc<unsigned>(1); CUDA_CHECK(cudaMemset(d_usable_feat, 0, sizeof(unsigned))); float* d_x_harris = memAlloc<float>(feat_pyr[i]); float* d_y_harris = memAlloc<float>(feat_pyr[i]); float* d_score_harris = memAlloc<float>(feat_pyr[i]); // Calculate Harris responses // Good block_size >= 7 (must be an odd number) dim3 threads(THREADS_X, THREADS_Y); dim3 blocks(divup(feat_pyr[i], threads.x), 1); harris_response<T,false><<<blocks, threads>>>(d_x_harris, d_y_harris, d_score_harris, NULL, d_x_pyr[i], d_y_pyr[i], NULL, feat_pyr[i], d_usable_feat, img_pyr[i], 7, 0.04f, patch_size); POST_LAUNCH_CHECK(); unsigned usable_feat = 0; CUDA_CHECK(cudaMemcpy(&usable_feat, d_usable_feat, sizeof(unsigned), cudaMemcpyDeviceToHost)); memFree(d_x_pyr[i]); memFree(d_y_pyr[i]); memFree(d_usable_feat); feat_pyr[i] = usable_feat; if (feat_pyr[i] == 0) { memFree(d_x_harris); memFree(d_y_harris); memFree(d_score_harris); if (i > 0) memFree((T*)img_pyr[i].ptr); continue; } Param<float> harris_sorted; Param<unsigned> harris_idx; harris_sorted.dims[0] = harris_idx.dims[0] = feat_pyr[i]; harris_sorted.strides[0] = harris_idx.strides[0] = 1; for (int k = 1; k < 4; k++) { harris_sorted.dims[k] = 1; harris_sorted.strides[k] = harris_sorted.dims[k - 1] * harris_sorted.strides[k - 1]; harris_idx.dims[k] = 1; harris_idx.strides[k] = harris_idx.dims[k - 1] * harris_idx.strides[k - 1]; } dim_type sort_elem = harris_sorted.strides[3] * harris_sorted.dims[3]; harris_sorted.ptr = d_score_harris; harris_idx.ptr = memAlloc<unsigned>(sort_elem); // Sort features according to Harris responses sort0_index<float, false>(harris_sorted, harris_idx); feat_pyr[i] = std::min(feat_pyr[i], lvl_best[i]); float* d_x_lvl = memAlloc<float>(feat_pyr[i]); float* d_y_lvl = memAlloc<float>(feat_pyr[i]); float* d_score_lvl = 
memAlloc<float>(feat_pyr[i]); // Keep only features with higher Harris responses threads = dim3(THREADS, 1); blocks = dim3(divup(feat_pyr[i], threads.x), 1); keep_features<T><<<blocks, threads>>>(d_x_lvl, d_y_lvl, d_score_lvl, NULL, d_x_harris, d_y_harris, harris_sorted.ptr, harris_idx.ptr, NULL, feat_pyr[i]); POST_LAUNCH_CHECK(); memFree(d_x_harris); memFree(d_y_harris); memFree(harris_sorted.ptr); memFree(harris_idx.ptr); float* d_ori_lvl = memAlloc<float>(feat_pyr[i]); // Compute orientation of features threads = dim3(THREADS_X, THREADS_Y); blocks = dim3(divup(feat_pyr[i], threads.x), 1); centroid_angle<T><<<blocks, threads>>>(d_x_lvl, d_y_lvl, d_ori_lvl, feat_pyr[i], img_pyr[i], patch_size); POST_LAUNCH_CHECK(); Param<T> lvl_tmp; Param<T> lvl_filt; for (int k = 0; k < 4; k++) { lvl_tmp.dims[k] = img_pyr[i].dims[k]; lvl_tmp.strides[k] = img_pyr[i].strides[k]; lvl_filt.dims[k] = img_pyr[i].dims[k]; lvl_filt.strides[k] = img_pyr[i].strides[k]; } dim_type lvl_elem = img_pyr[i].strides[3] * img_pyr[i].dims[3]; lvl_tmp.ptr = memAlloc<T>(lvl_elem); lvl_filt.ptr = memAlloc<T>(lvl_elem); // Separable Gaussian filtering to reduce noise sensitivity convolve2<T, convAccT, 0, false>(lvl_tmp, img_pyr[i], gauss_filter); convolve2<T, convAccT, 1, false>(lvl_filt, CParam<T>(lvl_tmp), gauss_filter); memFree(lvl_tmp.ptr); if (i > 0) { memFree((T*)img_pyr[i].ptr); } img_pyr[i].ptr = lvl_filt.ptr; for (int k = 0; k < 4; k++) { img_pyr[i].dims[k] = lvl_filt.dims[k]; img_pyr[i].strides[k] = lvl_filt.strides[k]; } float* d_size_lvl = memAlloc<float>(feat_pyr[i]); unsigned* d_desc_lvl = memAlloc<unsigned>(feat_pyr[i] * 8); CUDA_CHECK(cudaMemset(d_desc_lvl, 0, feat_pyr[i] * 8 * sizeof(unsigned))); // Compute ORB descriptors threads = dim3(THREADS_X, THREADS_Y); blocks = dim3(divup(feat_pyr[i], threads.x), 1); extract_orb<T><<<blocks, threads>>>(d_desc_lvl, feat_pyr[i], d_x_lvl, d_y_lvl, d_ori_lvl, d_size_lvl, img_pyr[i], lvl_scl[i], patch_size); POST_LAUNCH_CHECK(); memFree((T*)img_pyr[i].ptr); // Store results to pyramids total_feat += feat_pyr[i]; d_x_pyr[i] = d_x_lvl; d_y_pyr[i] = d_y_lvl; d_score_pyr[i] = d_score_lvl; d_ori_pyr[i] = d_ori_lvl; d_size_pyr[i] = d_size_lvl; d_desc_pyr[i] = d_desc_lvl; } memFree((T*)gauss_filter.ptr); // If no features are found, set found features to 0 and return if (total_feat == 0) { *out_feat = 0; return; } // Allocate output memory *d_x = memAlloc<float>(total_feat); *d_y = memAlloc<float>(total_feat); *d_score = memAlloc<float>(total_feat); *d_ori = memAlloc<float>(total_feat); *d_size = memAlloc<float>(total_feat); *d_desc = memAlloc<unsigned>(total_feat * 8); unsigned offset = 0; for (unsigned i = 0; i < max_levels; i++) { if (feat_pyr[i] == 0) continue; if (i > 0) offset += feat_pyr[i-1]; CUDA_CHECK(cudaMemcpy(*d_x+offset, d_x_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice)); CUDA_CHECK(cudaMemcpy(*d_y+offset, d_y_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice)); CUDA_CHECK(cudaMemcpy(*d_score+offset, d_score_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice)); CUDA_CHECK(cudaMemcpy(*d_ori+offset, d_ori_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice)); CUDA_CHECK(cudaMemcpy(*d_size+offset, d_size_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice)); CUDA_CHECK(cudaMemcpy(*d_desc+(offset*8), d_desc_pyr[i], feat_pyr[i] * 8 * sizeof(unsigned), cudaMemcpyDeviceToDevice)); memFree(d_x_pyr[i]); memFree(d_y_pyr[i]); memFree(d_score_pyr[i]); memFree(d_ori_pyr[i]); memFree(d_size_pyr[i]); 
memFree(d_desc_pyr[i]); } // Sets number of output features *out_feat = total_feat; }
/* // Getting feature map for the selected subimage in GPU // // API //int getFeatureMapsGPUStream(const int numStep, const int k, CvLSVMFeatureMapGPU **devs_img, CvLSVMFeatureMapGPU **devs_map, CUstream *streams) // INPUT // numStep // k // devs_img // streams // OUTPUT // devs_map // RESULT // Error status */ int getFeatureMapsGPUStream(const int numStep, const int k, CvLSVMFeatureMapGPU **devs_img, CvLSVMFeatureMapGPU **devs_map, CUstream *streams) { int sizeX, sizeY; int p, px; int height, width; int i, j; int *nearest; float *w, a_x, b_x; int size_r, size_alfa, size_nearest, size_w, size_map; CUresult res; CvLSVMFeatureMapGPU **devs_r, **devs_alfa; CUdeviceptr dev_nearest, dev_w; px = 3 * NUM_SECTOR; p = px; size_nearest = k; size_w = k * 2; devs_r = (CvLSVMFeatureMapGPU **) malloc( sizeof(CvLSVMFeatureMapGPU*) * numStep); devs_alfa = (CvLSVMFeatureMapGPU **) malloc( sizeof(CvLSVMFeatureMapGPU*) * numStep); nearest = (int *) malloc(sizeof(int) * size_nearest); w = (float *) malloc(sizeof(float) * size_w); // initialize "nearest" and "w" for (i = 0; i < k / 2; i++) { nearest[i] = -1; }/*for(i = 0; i < k / 2; i++)*/ for (i = k / 2; i < k; i++) { nearest[i] = 1; }/*for(i = k / 2; i < k; i++)*/ for (j = 0; j < k / 2; j++) { b_x = k / 2 + j + 0.5f; a_x = k / 2 - j - 0.5f; w[j * 2] = 1.0f / a_x * ((a_x * b_x) / (a_x + b_x)); w[j * 2 + 1] = 1.0f / b_x * ((a_x * b_x) / (a_x + b_x)); }/*for(j = 0; j < k / 2; j++)*/ for (j = k / 2; j < k; j++) { a_x = j - k / 2 + 0.5f; b_x = -j + k / 2 - 0.5f + k; w[j * 2] = 1.0f / a_x * ((a_x * b_x) / (a_x + b_x)); w[j * 2 + 1] = 1.0f / b_x * ((a_x * b_x) / (a_x + b_x)); }/*for(j = k / 2; j < k; j++)*/ res = cuMemAlloc(&dev_nearest, sizeof(int) * size_nearest); CUDA_CHECK(res, "cuMemAlloc(dev_nearest)"); res = cuMemAlloc(&dev_w, sizeof(float) * size_w); CUDA_CHECK(res, "cuMemAlloc(dev_w)"); res = cuMemcpyHtoDAsync(dev_nearest, nearest, sizeof(int) * size_nearest, streams[numStep - 1]); res = cuMemcpyHtoDAsync(dev_w, w, sizeof(float) * size_w, streams[numStep - 1]); // allocate device memory for (i = 0; i < numStep; i++) { width = devs_img[i]->sizeX; height = devs_img[i]->sizeY; allocFeatureMapObjectGPU<float>(&devs_r[i], width, height, 1); allocFeatureMapObjectGPU<int>(&devs_alfa[i], width, height, 2); } // excute async for (i = 0; i < numStep; i++) { // initialize "map", "r" and "alfa" width = devs_img[i]->sizeX; height = devs_img[i]->sizeY; sizeX = width / k; sizeY = height / k; size_map = sizeX * sizeY * p; size_r = width * height; size_alfa = width * height * 2; // initilize device memory value of 0 res = cuMemsetD32Async(devs_map[i]->map, 0, size_map, streams[i]); CUDA_CHECK(res, "cuMemset(dev_map)"); res = cuMemsetD32Async(devs_r[i]->map, 0, size_r, streams[i]); CUDA_CHECK(res, "cuMemset(dev_r)"); res = cuMemsetD32Async(devs_alfa[i]->map, 0, size_alfa, streams[i]); CUDA_CHECK(res, "cuMemset(dev_alfa)"); // launch kernel calculateHistogramGPULaunch(k, devs_img[i], devs_r[i], devs_alfa[i], streams[i]); } for (i = 0; i < numStep; i++) { getFeatureMapsGPULaunch(k, devs_r[i], devs_alfa[i], &dev_nearest, &dev_w, devs_map[i], streams[i]); } // free device memory res = cuMemFree(dev_nearest); CUDA_CHECK(res, "cuMemFree(dev_nearest)"); res = cuMemFree(dev_w); CUDA_CHECK(res, "cuMemFree(dev_w)"); for (i = 0; i < numStep; i++) { freeFeatureMapObjectGPU(&devs_r[i]); freeFeatureMapObjectGPU(&devs_alfa[i]); } free(nearest); free(w); free(devs_r); free(devs_alfa); return LATENT_SVM_OK; }
CudaEvent::CudaEvent( ) :
    isRecorded( false ),
    finished( true ),
    refCounter( 0u )
{
    log( ggLog::CUDA_RT()+ggLog::MEMORY(), "create event" );
    CUDA_CHECK( cudaEventCreateWithFlags( &event, cudaEventDisableTiming ) );
}
/* // Feature map reduction in GPU // In each cell we reduce dimension of the feature vector // according to original paper special procedure // // API //int PCAFeatureMapsGPUStream(const int numStep, const int bx, const int by, CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMap **feature_maps, CUstream *streams) // INPUT // numStep // bx // by // devs_map_in // streams // OUTPUT // feature_maps // RESULT // Error status */ int PCAFeatureMapsGPUStream(const int numStep, const int bx, const int by, CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMap **feature_maps, CUstream *streams) { int sizeX, sizeY, pp; int size_map_pca; int i; CUresult res; CvLSVMFeatureMapGPU **devs_map_pca; pp = NUM_SECTOR * 3 + 4; devs_map_pca = (CvLSVMFeatureMapGPU **) malloc( sizeof(CvLSVMFeatureMapGPU*) * (numStep)); // allocate memory for (i = 0; i < numStep; i++) { sizeX = devs_map_in[i]->sizeX + 2 * bx; sizeY = devs_map_in[i]->sizeY + 2 * by; size_map_pca = sizeX * sizeY * pp; allocFeatureMapObject(&feature_maps[i], sizeX, sizeY, pp); allocFeatureMapObjectGPU<float>(&devs_map_pca[i], sizeX, sizeY, pp); } // exucute async for (i = 0; i < numStep; i++) { sizeX = devs_map_pca[i]->sizeX; sizeY = devs_map_pca[i]->sizeY; size_map_pca = sizeX * sizeY * pp; // initilize device memory value of 0 res = cuMemsetD32Async(devs_map_pca[i]->map, 0, size_map_pca, streams[i]); CUDA_CHECK(res, "cuMemset(dev_map_pca)"); // launch kernel PCAFeatureMapsAddNullableBorderGPULaunch(devs_map_in[i], devs_map_pca[i], bx, by, streams[i]); } for (i = 0; i < numStep; i++) { sizeX = devs_map_pca[i]->sizeX; sizeY = devs_map_pca[i]->sizeY; size_map_pca = sizeX * sizeY * pp; // copy memory from device to host res = cuMemcpyDtoHAsync(feature_maps[i]->map, devs_map_pca[i]->map, sizeof(float) * size_map_pca, streams[i]); CUDA_CHECK(res, "cuMemcpyDtoH(dev_map_pca)"); } // free device memory for (i = 0; i < numStep; i++) { freeFeatureMapObjectGPU(&devs_map_pca[i]); } free(devs_map_pca); return LATENT_SVM_OK; }
/* // Feature map Normalization and Truncation in GPU // // API //int normalizeAndTruncateGPUStream(const int numStep, const float alfa, CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMapGPU **devs_map_out, CUstream *streams) // INPUT // numStep // alfa // devs_map_in // streams // OUTPUT // devs_map_out // RESULT // Error status */ int normalizeAndTruncateGPUStream(const int numStep, const float alfa, CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMapGPU **devs_map_out, CUstream *streams) { int sizeX, sizeY, newSizeX, newSizeY, pp; int size_norm, size_map_out; int i; CUresult res; CvLSVMFeatureMapGPU **devs_norm; pp = NUM_SECTOR * 12; devs_norm = (CvLSVMFeatureMapGPU **) malloc( sizeof(CvLSVMFeatureMapGPU*) * (numStep)); // allocate device memory for (i = 0; i < numStep; i++) { sizeX = devs_map_in[i]->sizeX; sizeY = devs_map_in[i]->sizeY; newSizeX = sizeX - 2; newSizeY = sizeY - 2; allocFeatureMapObjectGPU<float>(&devs_norm[i], sizeX, sizeY, 1); } // exucute async for (i = 0; i < numStep; i++) { sizeX = devs_map_in[i]->sizeX; sizeY = devs_map_in[i]->sizeY; newSizeX = sizeX - 2; newSizeY = sizeY - 2; size_norm = sizeX * sizeY; size_map_out = newSizeX * newSizeY * pp; // initilize device memory value of 0 res = cuMemsetD32Async(devs_norm[i]->map, 0, size_norm, streams[i]); CUDA_CHECK(res, "cuMemset(dev_norm)"); res = cuMemsetD32Async(devs_map_out[i]->map, 0, size_map_out, streams[i]); CUDA_CHECK(res, "cuMemset(dev_map_out)"); // launch kernel calculateNormGPULaunch(devs_map_in[i], devs_norm[i], streams[i]); } for (i = 0; i < numStep; i++) { // launch kernel normalizeGPULaunch(alfa, devs_map_in[i], devs_norm[i], devs_map_out[i], streams[i]); } // synchronize cuda stream for (i = 0; i < numStep; i++) { cuStreamSynchronize(streams[i]); } // free device memory for (i = 0; i < numStep; i++) { freeFeatureMapObjectGPU(&devs_norm[i]); } free(devs_norm); return LATENT_SVM_OK; }
void MultiStageMeanfieldLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { init_cpu = false; init_gpu = false; const caffe::MultiStageMeanfieldParameter meanfield_param = this->layer_param_.multi_stage_meanfield_param(); num_iterations_ = meanfield_param.num_iterations(); CHECK_GT(num_iterations_, 1) << "Number of iterations must be greater than 1."; theta_alpha_ = meanfield_param.theta_alpha(); theta_beta_ = meanfield_param.theta_beta(); theta_gamma_ = meanfield_param.theta_gamma(); count_ = bottom[0]->count(); num_ = bottom[0]->num(); channels_ = bottom[0]->channels(); height_ = bottom[0]->height(); width_ = bottom[0]->width(); num_pixels_ = height_ * width_; LOG(INFO) << "This implementation has not been tested batch size > 1."; top[0]->Reshape(num_, channels_, height_, width_); // Initialize the parameters that will updated by backpropagation. if (this->blobs_.size() > 0) { LOG(INFO) << "Multimeanfield layer skipping parameter initialization."; } else { this->blobs_.resize(3);// blobs_[0] - spatial kernel weights, blobs_[1] - bilateral kernel weights, blobs_[2] - compatability matrix // Allocate space for kernel weights. this->blobs_[0].reset(new Blob<Dtype>(1, 1, channels_, channels_)); this->blobs_[1].reset(new Blob<Dtype>(1, 1, channels_, channels_)); caffe_set(channels_ * channels_, Dtype(0.), this->blobs_[0]->mutable_cpu_data()); caffe_set(channels_ * channels_, Dtype(0.), this->blobs_[1]->mutable_cpu_data()); // Initialize the kernels weights. The two files spatial.par and bilateral.par should be available. FILE * pFile; pFile = fopen("spatial.par", "r"); CHECK(pFile) << "The file 'spatial.par' is not found. Please create it with initial spatial kernel weights."; for (int i = 0; i < channels_; i++) { fscanf(pFile, "%lf", &this->blobs_[0]->mutable_cpu_data()[i * channels_ + i]); } fclose(pFile); pFile = fopen("bilateral.par", "r"); CHECK(pFile) << "The file 'bilateral.par' is not found. Please create it with initial bilateral kernel weights."; for (int i = 0; i < channels_; i++) { fscanf(pFile, "%lf", &this->blobs_[1]->mutable_cpu_data()[i * channels_ + i]); } fclose(pFile); // Initialize the compatibility matrix. this->blobs_[2].reset(new Blob<Dtype>(1, 1, channels_, channels_)); caffe_set(channels_ * channels_, Dtype(0.), this->blobs_[2]->mutable_cpu_data()); // Initialize it to have the Potts model. for (int c = 0; c < channels_; ++c) { (this->blobs_[2]->mutable_cpu_data())[c * channels_ + c] = Dtype(-1.); } } float spatial_kernel[2 * num_pixels_]; float *spatial_kernel_gpu_; compute_spatial_kernel(spatial_kernel); spatial_lattice_.reset(new ModifiedPermutohedral()); spatial_norm_.Reshape(1, 1, height_, width_); Dtype* norm_data_gpu ; Dtype* norm_data; // Initialize the spatial lattice. This does not need to be computed for every image because we use a fixed size. switch (Caffe::mode()) { case Caffe::CPU: norm_data = spatial_norm_.mutable_cpu_data(); spatial_lattice_->init(spatial_kernel, 2, width_, height_); // Calculate spatial filter normalization factors. 
norm_feed_= new Dtype[num_pixels_]; caffe_set(num_pixels_, Dtype(1.0), norm_feed_); // pass norm_feed and norm_data to gpu spatial_lattice_->compute(norm_data, norm_feed_, 1); bilateral_kernel_buffer_ = new float[5 * num_pixels_]; init_cpu = true; break; #ifndef CPU_ONLY case Caffe::GPU: CUDA_CHECK(cudaMalloc((void**)&spatial_kernel_gpu_, 2*num_pixels_ * sizeof(float))) ; CUDA_CHECK(cudaMemcpy(spatial_kernel_gpu_, spatial_kernel, 2*num_pixels_ * sizeof(float), cudaMemcpyHostToDevice)) ; spatial_lattice_->init(spatial_kernel_gpu_, 2, width_, height_); CUDA_CHECK(cudaMalloc((void**)&norm_feed_, num_pixels_ * sizeof(Dtype))) ; caffe_gpu_set(num_pixels_, Dtype(1.0), norm_feed_); norm_data_gpu = spatial_norm_.mutable_gpu_data(); spatial_lattice_->compute(norm_data_gpu, norm_feed_, 1); norm_data = spatial_norm_.mutable_cpu_data(); CUDA_CHECK(cudaMalloc((void**)&bilateral_kernel_buffer_, 5 * num_pixels_ * sizeof(float))) ; CUDA_CHECK(cudaFree(spatial_kernel_gpu_)); init_gpu = true; break; #endif default: LOG(FATAL) << "Unknown caffe mode."; } for (int i = 0; i < num_pixels_; ++i) { norm_data[i] = 1.0f / (norm_data[i] + 1e-20f); } bilateral_norms_.Reshape(num_, 1, height_, width_); // Configure the split layer that is used to make copies of the unary term. One copy for each iteration. // It may be possible to optimize this calculation later. split_layer_bottom_vec_.clear(); split_layer_bottom_vec_.push_back(bottom[0]); split_layer_top_vec_.clear(); split_layer_out_blobs_.resize(num_iterations_); for (int i = 0; i < num_iterations_; i++) { split_layer_out_blobs_[i].reset(new Blob<Dtype>()); split_layer_top_vec_.push_back(split_layer_out_blobs_[i].get()); } LayerParameter split_layer_param; split_layer_.reset(new SplitLayer<Dtype>(split_layer_param)); split_layer_->SetUp(split_layer_bottom_vec_, split_layer_top_vec_); // Make blobs to store outputs of each meanfield iteration. Output of the last iteration is stored in top[0]. // So we need only (num_iterations_ - 1) blobs. iteration_output_blobs_.resize(num_iterations_ - 1); for (int i = 0; i < num_iterations_ - 1; ++i) { iteration_output_blobs_[i].reset(new Blob<Dtype>(num_, channels_, height_, width_)); } // Make instances of MeanfieldIteration and initialize them. meanfield_iterations_.resize(num_iterations_); for (int i = 0; i < num_iterations_; ++i) { meanfield_iterations_[i].reset(new MeanfieldIteration<Dtype>()); meanfield_iterations_[i]->OneTimeSetUp( split_layer_out_blobs_[i].get(), // unary terms (i == 0) ? bottom[1] : iteration_output_blobs_[i - 1].get(), // softmax input (i == num_iterations_ - 1) ? top[0] : iteration_output_blobs_[i].get(), // output blob spatial_lattice_, // spatial lattice &spatial_norm_); // spatial normalization factors. } this->param_propagate_down_.resize(this->blobs_.size(), true); LOG(INFO) << ("MultiStageMeanfieldLayer initialized."); }
/* // Property Message // // API //static int getPathOfFeaturePyramidGPUStream(IplImage * image, float step, int numStep, int startIndex, int sideLength, int bx, int by, CvLSVMFeaturePyramid **maps) // INPUT // image // step // numStep // startIndex // sideLength // bx // by // OUTPUT // maps // RESULT // Error status */ static int getPathOfFeaturePyramidGPUStream(IplImage * image, float step, int numStep, int startIndex, int sideLength, int bx, int by, CvLSVMFeaturePyramid **maps) { CvLSVMFeatureMap **feature_maps; int i; int width, height, numChannels, sizeX, sizeY, p, pp, newSizeX, newSizeY; float *scales; CvLSVMFeatureMapGPU **devs_img, **devs_map_pre_norm, **devs_map_pre_pca; CUstream *streams; CUresult res; scales = (float *) malloc(sizeof(float) * (numStep)); devs_img = (CvLSVMFeatureMapGPU **) malloc( sizeof(CvLSVMFeatureMapGPU*) * (numStep)); devs_map_pre_norm = (CvLSVMFeatureMapGPU **) malloc( sizeof(CvLSVMFeatureMapGPU*) * (numStep)); devs_map_pre_pca = (CvLSVMFeatureMapGPU **) malloc( sizeof(CvLSVMFeatureMapGPU*) * (numStep)); streams = (CUstream *) malloc(sizeof(CUstream) * (numStep)); feature_maps = (CvLSVMFeatureMap **) malloc( sizeof(CvLSVMFeatureMap *) * (numStep)); // allocate device memory for (i = 0; i < numStep; i++) { scales[i] = 1.0f / powf(step, (float) i); width = (int) (((float) image->width ) * scales[i] + 0.5); height = (int) (((float) image->height) * scales[i] + 0.5); numChannels = image->nChannels; sizeX = width / sideLength; sizeY = height / sideLength; p = NUM_SECTOR * 3; pp = NUM_SECTOR * 12; newSizeX = sizeX - 2; newSizeY = sizeY - 2; allocFeatureMapObjectGPU<float>(&devs_img[i], width, height, numChannels); allocFeatureMapObjectGPU<float>(&devs_map_pre_norm[i], sizeX, sizeY, p); allocFeatureMapObjectGPU<float>(&devs_map_pre_pca[i], newSizeX, newSizeY, pp); res = cuStreamCreate(&streams[i], CU_STREAM_DEFAULT); CUDA_CHECK(res, "cuStreamCreate(stream)"); } // excute main function resizeGPUStream(numStep, image, scales, devs_img, streams); getFeatureMapsGPUStream(numStep, sideLength, devs_img, devs_map_pre_norm, streams); normalizeAndTruncateGPUStream(numStep, Val_Of_Truncate, devs_map_pre_norm, devs_map_pre_pca, streams); PCAFeatureMapsGPUStream(numStep, bx, by, devs_map_pre_pca, feature_maps, streams); // synchronize cuda stream for (i = 0; i < numStep; i++) { cuStreamSynchronize(streams[i]); cuStreamDestroy(streams[i]); } for (i = 0; i < numStep; i++) { (*maps)->pyramid[startIndex + i] = feature_maps[i]; }/*for(i = 0; i < numStep; i++)*/ // free device memory for (i = 0; i < numStep; i++) { freeFeatureMapObjectGPU(&devs_img[i]); freeFeatureMapObjectGPU(&devs_map_pre_norm[i]); freeFeatureMapObjectGPU(&devs_map_pre_pca[i]); } free(scales); free(devs_img); free(devs_map_pre_norm); free(devs_map_pre_pca); free(streams); free(feature_maps); return LATENT_SVM_OK; }
cl_int
pocl_cuda_alloc_mem_obj (cl_device_id device, cl_mem mem_obj, void *host_ptr)
{
  cuCtxSetCurrent (((pocl_cuda_device_data_t *)device->data)->context);

  CUresult result;
  void *b = NULL;

  /* if memory for this global memory is not yet allocated -> do it */
  if (mem_obj->device_ptrs[device->global_mem_id].mem_ptr == NULL)
    {
      cl_mem_flags flags = mem_obj->flags;

      if (flags & CL_MEM_USE_HOST_PTR)
        {
#if defined __arm__
          // cuMemHostRegister is not supported on ARM.
          // Allocate device memory and perform explicit copies
          // before and after running a kernel
          result = cuMemAlloc ((CUdeviceptr *)&b, mem_obj->size);
          CUDA_CHECK (result, "cuMemAlloc");
#else
          result = cuMemHostRegister (host_ptr, mem_obj->size,
                                      CU_MEMHOSTREGISTER_DEVICEMAP);
          if (result != CUDA_SUCCESS
              && result != CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED)
            CUDA_CHECK (result, "cuMemHostRegister");
          result = cuMemHostGetDevicePointer ((CUdeviceptr *)&b, host_ptr, 0);
          CUDA_CHECK (result, "cuMemHostGetDevicePointer");
#endif
        }
      else if (flags & CL_MEM_ALLOC_HOST_PTR)
        {
          result = cuMemHostAlloc (&mem_obj->mem_host_ptr, mem_obj->size,
                                   CU_MEMHOSTREGISTER_DEVICEMAP);
          CUDA_CHECK (result, "cuMemHostAlloc");
          result = cuMemHostGetDevicePointer ((CUdeviceptr *)&b,
                                              mem_obj->mem_host_ptr, 0);
          CUDA_CHECK (result, "cuMemHostGetDevicePointer");
        }
      else
        {
          result = cuMemAlloc ((CUdeviceptr *)&b, mem_obj->size);
          if (result != CUDA_SUCCESS)
            {
              const char *err;
              cuGetErrorName (result, &err);
              POCL_MSG_PRINT2 (__FUNCTION__, __LINE__,
                               "-> Failed to allocate memory: %s\n", err);
              return CL_MEM_OBJECT_ALLOCATION_FAILURE;
            }
        }

      if (flags & CL_MEM_COPY_HOST_PTR)
        {
          result = cuMemcpyHtoD ((CUdeviceptr)b, host_ptr, mem_obj->size);
          CUDA_CHECK (result, "cuMemcpyHtoD");
        }

      mem_obj->device_ptrs[device->global_mem_id].mem_ptr = b;
      mem_obj->device_ptrs[device->global_mem_id].global_mem_id
          = device->global_mem_id;
    }

  /* copy already allocated global mem info to devices own slot */
  mem_obj->device_ptrs[device->dev_id]
      = mem_obj->device_ptrs[device->global_mem_id];

  return CL_SUCCESS;
}
CUresult TestSAXPY( chCUDADevice *chDevice, size_t N, float alpha ) { CUresult status; CUdeviceptr dptrOut = 0; CUdeviceptr dptrIn = 0; float *hostOut = 0; float *hostIn = 0; CUDA_CHECK( cuCtxPushCurrent( chDevice->context() ) ); CUDA_CHECK( cuMemAlloc( &dptrOut, N*sizeof(float) ) ); CUDA_CHECK( cuMemsetD32( dptrOut, 0, N ) ); CUDA_CHECK( cuMemAlloc( &dptrIn, N*sizeof(float) ) ); CUDA_CHECK( cuMemHostAlloc( (void **) &hostOut, N*sizeof(float), 0 ) ); CUDA_CHECK( cuMemHostAlloc( (void **) &hostIn, N*sizeof(float), 0 ) ); for ( size_t i = 0; i < N; i++ ) { hostIn[i] = (float) rand() / (float) RAND_MAX; } CUDA_CHECK( cuMemcpyHtoDAsync( dptrIn, hostIn, N*sizeof(float ), NULL ) ); { CUmodule moduleSAXPY; CUfunction kernelSAXPY; void *params[] = { &dptrOut, &dptrIn, &N, &alpha }; moduleSAXPY = chDevice->module( "saxpy.ptx" ); if ( ! moduleSAXPY ) { status = CUDA_ERROR_NOT_FOUND; goto Error; } CUDA_CHECK( cuModuleGetFunction( &kernelSAXPY, moduleSAXPY, "saxpy" ) ); CUDA_CHECK( cuLaunchKernel( kernelSAXPY, 1500, 1, 1, 512, 1, 1, 0, NULL, params, NULL ) ); } CUDA_CHECK( cuMemcpyDtoHAsync( hostOut, dptrOut, N*sizeof(float), NULL ) ); CUDA_CHECK( cuCtxSynchronize() ); for ( size_t i = 0; i < N; i++ ) { if ( fabsf( hostOut[i] - alpha*hostIn[i] ) > 1e-5f ) { status = CUDA_ERROR_UNKNOWN; goto Error; } } status = CUDA_SUCCESS; printf( "Well it worked!\n" ); Error: cuCtxPopCurrent( NULL ); cuMemFreeHost( hostOut ); cuMemFreeHost( hostIn ); cuMemFree( dptrOut ); cuMemFree( dptrIn ); return status; }
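// TestSAXPY loads its kernel from saxpy.ptx, so the device code is not shown
// in this collection. A hypothetical kernel consistent with the launch above
// (parameter order {out, in, N, alpha}, 1500 blocks of 512 threads, and the
// host-side check hostOut[i] == alpha * hostIn[i]) could look like this:
extern "C" __global__ void
saxpy(float *out, const float *in, size_t N, float alpha)
{
    // Grid-stride loop so a fixed launch configuration covers any N.
    size_t stride = (size_t) gridDim.x * blockDim.x;
    for (size_t i = (size_t) blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride) {
        out[i] = alpha * in[i];
    }
}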
void Caffe::SetDevice(const int device_id) {
  root_device_ = device_id;
  CUDA_CHECK(cudaSetDevice(root_device_));
}