////////////////////////////
// kernel caller definitions

bool SURF_OCL::calcLayerDetAndTrace(int octave, int c_layer_rows)
{
    int nOctaveLayers = params->nOctaveLayers;
    const int min_size = calcSize(octave, 0);
    const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
    const int max_samples_j = 1 + ((img_cols - min_size) >> octave);

    size_t localThreads[] = {16, 16};
    size_t globalThreads[] =
    {
        divUp(max_samples_j, (int)localThreads[0]) * localThreads[0],
        divUp(max_samples_i, (int)localThreads[1]) * localThreads[1] * (nOctaveLayers + 2)
    };

    ocl::Kernel kerCalcDetTrace("SURF_calcLayerDetAndTrace", ocl::xfeatures2d::surf_oclsrc, kerOpts);
    if (haveImageSupport)
    {
        kerCalcDetTrace.args(sumTex,
                             img_rows, img_cols, nOctaveLayers, octave, c_layer_rows,
                             ocl::KernelArg::WriteOnlyNoSize(det),
                             ocl::KernelArg::WriteOnlyNoSize(trace));
    }
    else
    {
        kerCalcDetTrace.args(ocl::KernelArg::ReadOnlyNoSize(sum),
                             img_rows, img_cols, nOctaveLayers, octave, c_layer_rows,
                             ocl::KernelArg::WriteOnlyNoSize(det),
                             ocl::KernelArg::WriteOnlyNoSize(trace));
    }
    return kerCalcDetTrace.run(2, globalThreads, localThreads, true);
}
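// --- illustration (not part of the library) ---------------------------------
// Every caller in this section rounds its global work size up to a whole
// number of workgroups with divUp. A minimal sketch of the assumed helper
// semantics for non-negative integers (OpenCV defines its own divUp; the
// name divUpSketch here is hypothetical):

#include <cstddef>

static inline size_t divUpSketch(size_t total, size_t grain)
{
    return (total + grain - 1) / grain; // smallest q such that q * grain >= total
}

// e.g. divUpSketch(100, 16) == 7, and 7 * 16 == 112 >= 100, so a grid of
// 7 groups of 16 work-items covers 100 samples; the kernel bounds-checks
// the 12 surplus items.
// -----------------------------------------------------------------------------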
static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
                                     int templateWindowSize, int searchWindowSize)
{
    int type = _src.type(), cn = CV_MAT_CN(type);
    Size size = _src.size();

    // only 8-bit 1-, 2-, and 4-channel inputs are supported
    if (type != CV_8UC1 && type != CV_8UC2 && type != CV_8UC4)
        return false;

    int templateWindowHalfSize = templateWindowSize / 2;
    int searchWindowHalfSize = searchWindowSize / 2;
    templateWindowSize = templateWindowHalfSize * 2 + 1;
    searchWindowSize = searchWindowHalfSize * 2 + 1;

    int nblocksx = divUp(size.width, BLOCK_COLS), nblocksy = divUp(size.height, BLOCK_ROWS);
    int almostTemplateWindowSizeSqBinShift = -1;

    char cvt[2][40];
    String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d"
                         " -D uchar_t=%s -D int_t=%s -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
                         " -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d"
                         " -D convert_int_t=%s -D cn=%d -D CTA_SIZE2=%d -D convert_uchar_t=%s",
                         templateWindowSize, searchWindowSize, ocl::typeToStr(type),
                         ocl::typeToStr(CV_32SC(cn)), BLOCK_COLS, BLOCK_ROWS, CTA_SIZE,
                         templateWindowHalfSize, searchWindowHalfSize,
                         ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), cn,
                         CTA_SIZE >> 1, ocl::convertTypeStr(CV_32S, CV_8U, cn, cvt[1]));

    ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts);
    if (k.empty())
        return false;

    UMat almostDist2Weight;
    if (!ocl_calcAlmostDist2Weight<float>(almostDist2Weight, searchWindowSize,
                                          templateWindowSize, h, cn,
                                          almostTemplateWindowSizeSqBinShift))
        return false;
    CV_Assert(almostTemplateWindowSizeSqBinShift >= 0);

    UMat srcex;
    int borderSize = searchWindowHalfSize + templateWindowHalfSize;
    copyMakeBorder(_src, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT);

    _dst.create(size, type);
    UMat dst = _dst.getUMat();

    int searchWindowSizeSq = searchWindowSize * searchWindowSize;
    Size upColSumSize(size.width, searchWindowSizeSq * nblocksy);
    Size colSumSize(nblocksx * templateWindowSize, searchWindowSizeSq * nblocksy);
    UMat buffer(upColSumSize + colSumSize, CV_32SC(cn));

    srcex = srcex(Rect(Point(borderSize, borderSize), size));

    k.args(ocl::KernelArg::ReadOnlyNoSize(srcex), ocl::KernelArg::WriteOnly(dst),
           ocl::KernelArg::PtrReadOnly(almostDist2Weight),
           ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift);

    size_t globalsize[2] = { nblocksx * CTA_SIZE, nblocksy }, localsize[2] = { CTA_SIZE, 1 };
    return k.run(2, globalsize, localsize, false);
}
__host__ void gridPyrDown_(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
{
    const int rows = getRows(src);
    const int cols = getCols(src);

    dst.create(divUp(rows, 2), divUp(cols, 2));

    pyramids_detail::pyrDown<Brd>(shrinkPtr(src), shrinkPtr(dst),
                                  rows, cols, dst.rows, dst.cols,
                                  StreamAccessor::getStream(stream));
}
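// --- illustration (not part of the library) ---------------------------------
// gridPyrDown_ sizes the destination as divUp(rows, 2) x divUp(cols, 2),
// i.e. halving with round-up, so odd dimensions keep their trailing
// row/column. A standalone restatement of that rule (halfUp is hypothetical):

static inline int halfUp(int n) { return (n + 1) / 2; } // divUp(n, 2)

// halfUp(7) == 4 and halfUp(6) == 3, so a 7x5 level downsamples to 4x3
// rather than dropping the last row/column.
// -----------------------------------------------------------------------------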
__host__ void reduce(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
{
    const dim3 block(Policy::block_size_x, Policy::block_size_y);
    const dim3 grid(divUp(cols, block.x * Policy::patch_size_x),
                    divUp(rows, block.y * Policy::patch_size_y));

    reduce<Reductor, Policy::block_size_x * Policy::block_size_y,
           Policy::patch_size_x, Policy::patch_size_y>
        <<<grid, block, 0, stream>>>(src, result, mask, rows, cols);
    CV_CUDEV_SAFE_CALL( cudaGetLastError() );

    if (stream == 0)
        CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
}
static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
{
    typedef TransformFunctorTraits<BinOp> ft;

    const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
    const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1);

    transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst,
                 const BinOp& op, const Mask& mask, cudaStream_t stream)
{
    dim3 threads(16, 16, 1);
    dim3 grid(1, 1, 1);
    grid.x = divUp(src1.cols, threads.x);
    grid.y = divUp(src1.rows, threads.y);

    transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
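// --- illustration (not part of the library) ---------------------------------
// The two `call` overloads above compute the same launch shape: a 2-D grid
// of fixed-size blocks tiling the image, one trait-driven and one hardcoded
// to 16x16. A host-only sketch of the grid math (a plain struct stands in
// for CUDA's dim3 so this compiles without CUDA headers; names hypothetical):

struct GridDim { unsigned x, y, z; };

static inline unsigned divUpU(unsigned a, unsigned b) { return (a + b - 1) / b; }

static inline GridDim makeGrid16(unsigned cols, unsigned rows)
{
    return GridDim{ divUpU(cols, 16), divUpU(rows, 16), 1u };
}

// For a 1920x1080 image this yields {120, 68, 1}: 68 * 16 = 1088 rows are
// scheduled and the kernel masks off the last 8 out-of-range rows.
// -----------------------------------------------------------------------------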
// provide additional methods for the user to interact with the command queue after a task is fired
static void openCLExecuteKernel_2(Context *clCxt, const char **source, std::string kernelName,
                                  size_t globalThreads[3], size_t localThreads[3],
                                  std::vector< std::pair<size_t, const void *> > &args,
                                  int channels, int depth, char *build_options,
                                  FLUSH_MODE finish_mode)
{
    // construct the kernel name
    // The rule is functionName_Cn_Dn: C represents the channel count, D the data type
    // depth, and n an integer. For example, split_C2_D2 is the split kernel with
    // channels == 2 and depth == 2 (the data type is char).
    std::stringstream idxStr;
    if (channels != -1)
        idxStr << "_C" << channels;
    if (depth != -1)
        idxStr << "_D" << depth;
    kernelName += idxStr.str();

    cl_kernel kernel;
    kernel = openCLGetKernelFromSource(clCxt, source, kernelName, build_options);

    if (localThreads != NULL)
    {
        globalThreads[0] = divUp(globalThreads[0], localThreads[0]) * localThreads[0];
        globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1];
        globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2];

        //size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2];
        cv::ocl::openCLVerifyKernel(clCxt, kernel, localThreads);
    }

    for (size_t i = 0; i < args.size(); i++)
        openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));

    openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL,
                                          globalThreads, localThreads, 0, NULL, NULL));

    switch (finish_mode)
    {
    case CLFINISH:
        clFinish(clCxt->impl->clCmdQueue);
        /* fall through */
    case CLFLUSH:
        clFlush(clCxt->impl->clCmdQueue);
        break;
    case DISABLE:
    default:
        break;
    }
    openCLSafeCall(clReleaseKernel(kernel));
}
////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////// split /////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
void split_vector_run_no_roi(const oclMat &mat_src, oclMat *mat_dst)
{
    Context *clCxt = mat_src.clCxt;
    int channels = mat_src.channels();
    int depth = mat_src.depth();

    string kernelName = "split_vector";

    int indexes[4][7] = {{0, 0, 0, 0, 0, 0, 0},
                         {8, 8, 8, 8, 4, 4, 2},
                         {8, 8, 8, 8, 4, 4, 4},
                         {4, 4, 2, 2, 1, 1, 1}};

    size_t index = indexes[channels - 1][mat_dst[0].depth()];
    int cols = divUp(mat_src.cols, index);
    size_t localThreads[3] = { 64, 4, 1 };
    size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0],
                                divUp(mat_src.rows, localThreads[1]) * localThreads[1],
                                1 };

    vector<pair<size_t, const void *> > args;
    args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src.data));
    args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src.step));
    args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src.rows));
    args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
    args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_dst[0].data));
    args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst[0].step));
    args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_dst[1].data));
    args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst[1].step));
    if (channels >= 3)
    {
        args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_dst[2].data));
        args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst[2].step));
    }
    if (channels >= 4)
    {
        args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_dst[3].data));
        args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst[3].step));
    }

    openCLExecuteKernel(clCxt, &split_mat, kernelName, globalThreads, localThreads, args, channels, depth);
}
size_t SampleConverterBase::targetSize(size_t sourceSize)
{
    // we round up on conversion
    size_t numSamples = divUp(sourceSize, (size_t)mSourceSampleSize);
    if (numSamples > SIZE_MAX / mTargetSampleSize) {
        ALOGW("limiting target size due to overflow (%zu*%zu/%zu)",
                sourceSize, mTargetSampleSize, mSourceSampleSize);
        return SIZE_MAX;
    }
    return numSamples * mTargetSampleSize;
}
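// --- illustration (not part of the original file) ---------------------------
// The guard in targetSize() rejects products that would wrap: if
// numSamples > SIZE_MAX / target, then numSamples * target > SIZE_MAX.
// A standalone restatement of that predicate (wouldOverflow is hypothetical):

#include <cstddef>
#include <cstdint>

static inline bool wouldOverflow(size_t numSamples, size_t targetSampleSize)
{
    return targetSampleSize != 0 &&
           numSamples > SIZE_MAX / targetSampleSize;
}

// e.g. with 8-byte target samples, any numSamples above SIZE_MAX / 8 is
// clamped to SIZE_MAX rather than silently truncated by the multiply.
// -----------------------------------------------------------------------------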
__host__ void histogram(const SrcPtr& src, ResType* hist, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
{
    const dim3 block(Policy::block_size_x, Policy::block_size_y);
    const dim3 grid(divUp(rows, block.y));

    const int BLOCK_SIZE = Policy::block_size_x * Policy::block_size_y;

    histogram<BIN_COUNT, BLOCK_SIZE><<<grid, block, 0, stream>>>(src, hist, mask, rows, cols);
    CV_CUDEV_SAFE_CALL( cudaGetLastError() );

    if (stream == 0)
        CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
}
__host__ void reduceToRow(const SrcPtr& src, ResType* dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
{
    const int BLOCK_SIZE_X = 16;
    const int BLOCK_SIZE_Y = 16;

    const dim3 block(BLOCK_SIZE_X, BLOCK_SIZE_Y);
    const dim3 grid(divUp(cols, block.x));

    reduceToRow<Reductor, BLOCK_SIZE_X, BLOCK_SIZE_Y><<<grid, block, 0, stream>>>(src, dst, mask, rows, cols);
    CV_CUDEV_SAFE_CALL( cudaGetLastError() );

    if (stream == 0)
        CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
}
bool SURF_OCL::findMaximaInLayer(int counterOffset, int octave, int layer_rows, int layer_cols)
{
    const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;
    int nOctaveLayers = params->nOctaveLayers;

    size_t localThreads[3] = {16, 16};
    size_t globalThreads[3] =
    {
        divUp(layer_cols - 2 * min_margin, (int)localThreads[0] - 2) * localThreads[0],
        divUp(layer_rows - 2 * min_margin, (int)localThreads[1] - 2) * nOctaveLayers * localThreads[1]
    };

    ocl::Kernel kerFindMaxima("SURF_findMaximaInLayer", ocl::xfeatures2d::surf_oclsrc, kerOpts);

    return kerFindMaxima.args(ocl::KernelArg::ReadOnlyNoSize(det),
                              ocl::KernelArg::ReadOnlyNoSize(trace),
                              ocl::KernelArg::PtrReadWrite(maxPosBuffer),
                              ocl::KernelArg::PtrReadWrite(counters),
                              counterOffset, img_rows, img_cols,
                              octave, nOctaveLayers,
                              layer_rows, layer_cols,
                              maxCandidates,
                              (float)params->hessianThreshold)
           .run(2, globalThreads, localThreads, true);
}
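// --- illustration (an assumption, not taken from the kernel source) ---------
// A plausible reading of the "(int)localThreads[0] - 2" divisor above: each
// 16x16 workgroup keeps a one-pixel halo on every side for the neighborhood
// maximum test, so only 14x14 items emit results and adjacent groups overlap
// by two pixels. Under that assumption the group count per axis would be:

static inline int groupsWithHalo(int span, int groupDim /* e.g. 16 */)
{
    const int useful = groupDim - 2;          // interior items per group
    return (span + useful - 1) / useful;      // divUp(span, groupDim - 2)
}
// -----------------------------------------------------------------------------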
static void merge_vector_run(const oclMat *mat_src, size_t n, oclMat &mat_dst)
{
    if (!mat_dst.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && mat_dst.type() == CV_64F)
    {
        CV_Error(Error::GpuNotSupported, "Selected device doesn't support double\r\n");
        return;
    }

    Context *clCxt = mat_dst.clCxt;
    int channels = mat_dst.oclchannels();
    int depth = mat_dst.depth();

    String kernelName = "merge_vector";

    int vector_lengths[4][7] = {{0, 0, 0, 0, 0, 0, 0},
                                {2, 2, 1, 1, 1, 1, 1},
                                {4, 4, 2, 2, 1, 1, 1},
                                {1, 1, 1, 1, 1, 1, 1}};

    size_t vector_length = vector_lengths[channels - 1][depth];
    int offset_cols = (mat_dst.offset / mat_dst.elemSize()) & (vector_length - 1);
    int cols = divUp(mat_dst.cols + offset_cols, vector_length);

    size_t localThreads[3] = { 64, 4, 1 };
    size_t globalThreads[3] = { cols, mat_dst.rows, 1 };

    int dst_step1 = mat_dst.cols * mat_dst.elemSize();
    std::vector<std::pair<size_t, const void *> > args;
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst.data));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst.step));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst.offset));
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src[0].data));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[0].step));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[0].offset));
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src[1].data));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[1].step));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[1].offset));
    if (channels == 4)
    {
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src[2].data));
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[2].step));
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[2].offset));

        // a 3-channel source has been expanded to 4 channels, so the third
        // source plane is passed again as the fourth input
        if (n == 3)
        {
            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src[2].data));
            args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[2].step));
            args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[2].offset));
        }
        else if (n == 4)
        {
            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src[3].data));
            args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[3].step));
            args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[3].offset));
        }
    }
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst.rows));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1));

    openCLExecuteKernel(clCxt, &merge_mat, kernelName, globalThreads, localThreads, args, channels, depth);
}
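// --- illustration (not part of the library) ---------------------------------
// merge_vector_run measures the ROI's misalignment with
// "offset & (vector_length - 1)". That equals offset % vector_length only
// when vector_length is a power of two, which the lookup table above
// guarantees (entries are 1, 2, or 4). Standalone sketch (names hypothetical):

static inline int misalignElems(int offsetElems, int vecLen /* power of two */)
{
    return offsetElems & (vecLen - 1); // same as offsetElems % vecLen for 2^k
}

// e.g. misalignElems(13, 4) == 1, so one extra column is folded into the
// divUp(cols + offset_cols, vector_length) width computation.
// -----------------------------------------------------------------------------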
void ShapeApp::setup()
{
    // setup kinect
    //=============================================================================
    kinect.setRegistration(true);
    kinect.init();
    if (kinect.open()) {
        kinect_on = true;
    }
    else {
        kinect_on = false;
    }
    // spin until the device reports a connection
    while (!kinect.isConnected())
        ;

    ofSetFrameRate(30);
    TIME_SAMPLE_SET_FRAMERATE(30.0f);

    // setup UI
    //=============================================================================
    setupUI();

    // setup tracking data objects
    //=============================================================================
    // CPU
    curr_f.init(DEPTH_X_RES, DEPTH_Y_RES);
    curr_f.allocateHost();
    new_f.init(DEPTH_X_RES, DEPTH_Y_RES);
    new_f.allocateHost();
    est_f.init(DEPTH_X_RES, DEPTH_Y_RES);
    est_f.allocateHost();
    view_f.init(DEPTH_X_RES, DEPTH_Y_RES);
    view_f.allocateHost();

    image.allocate(DEPTH_X_RES, DEPTH_Y_RES, OF_IMAGE_COLOR);
    view_image.allocate(DEPTH_X_RES, DEPTH_Y_RES, OF_IMAGE_COLOR);
    est_image.allocate(DEPTH_X_RES, DEPTH_Y_RES, OF_IMAGE_COLOR);

    // GPU
    curr_f.allocateDevice();
    new_f.allocateDevice();
    est_f.allocateDevice();
    view_f.allocateDevice();

    // data for ICP
    //=============================================================================
    // GPU
    corresp.blocks_x = divUp(DEPTH_X_RES, CORRESPONDENCE_BLOCK_X);
    corresp.blocks_y = divUp(DEPTH_Y_RES, CORRESPONDENCE_BLOCK_Y);
    corresp.blocks_n = corresp.blocks_x * corresp.blocks_y;
    corresp.AtA_dev_size = AtA_SIZE * corresp.blocks_n;
    corresp.Atb_dev_size = Atb_SIZE * corresp.blocks_n;
    cudaMalloc((void **) &corresp.AtA_dev, corresp.AtA_dev_size * sizeof(float));
    cudaMalloc((void **) &corresp.Atb_dev, corresp.Atb_dev_size * sizeof(float));
    cudaMalloc((void **) &corresp.points_dev, curr_f.host.points_bn);

    // CPU
    corresp.AtA_host = (float *)malloc(corresp.AtA_dev_size * sizeof(float));
    corresp.Atb_host = (float *)malloc(corresp.Atb_dev_size * sizeof(float));
    corresp.AtA_sum = (float *)malloc(AtA_SIZE * sizeof(float));
    corresp.Atb_sum = (float *)malloc(Atb_SIZE * sizeof(float));
    correspondence_host = (float *)malloc(curr_f.host.points_bn);
    correspondence_dev = (float *)malloc(curr_f.host.points_bn);
    corresp.points_host = (float *)malloc(curr_f.host.points_bn);

    // voxel data
    //=============================================================================
    // CPU
    min.set(-0.5, -0.5, -1.5);
    max.set(0.5, 0.5, -0.5);
    voxels.min = min;
    voxels.side_n = 256;
    voxels.size = (max - min) / (float)voxels.side_n;
    voxels.array_size = voxels.side_n * voxels.side_n * voxels.side_n;
    voxels.bytes_n = sizeof(float) * voxels.array_size;
    voxels.data = (float *)malloc(voxels.bytes_n);
    voxels.w_bytes_n = sizeof(unsigned char) * voxels.array_size;
    voxels.w_data = (unsigned char *)malloc(voxels.w_bytes_n);

    // GPU
    cudaMalloc((void **) &camera_opt.t, sizeof(float) * 16);
    cudaMalloc((void **) &camera_opt.it, sizeof(float) * 16);
    camera_opt.ref_pix_size = kinect.getRefPixelSize();
    camera_opt.ref_distance = kinect.getRefDistance();
    setFloat3(camera_opt.min, min);
    setFloat3(camera_opt.max, max);

    cudaMalloc((void **) &dev_voxels.data, voxels.bytes_n);
    cudaMalloc((void **) &dev_voxels.w_data, voxels.w_bytes_n);
    setFloat3(&dev_voxels.min, voxels.min);
    setFloat3(&dev_voxels.size, voxels.size);
    dev_voxels.side_n = voxels.side_n;
    dev_voxels.side_n2 = dev_voxels.side_n * dev_voxels.side_n;
    dev_voxels.array_size = voxels.array_size;
    dev_voxels.bytes_n = voxels.bytes_n;
    dev_voxels.w_bytes_n = voxels.w_bytes_n;

    resetVoxels();
}
void merge_vector_run(const oclMat *mat_src, size_t n, oclMat &mat_dst)
{
    if (mat_dst.clCxt->impl->double_support == 0 && mat_dst.type() == CV_64F)
    {
        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
        return;
    }

    Context *clCxt = mat_dst.clCxt;
    int channels = mat_dst.channels();
    int depth = mat_dst.depth();

    string kernelName = "merge_vector";

    int vector_lengths[4][7] = {{0, 0, 0, 0, 0, 0, 0},
                                {2, 2, 1, 1, 1, 1, 1},
                                {4, 4, 2, 2, 1, 1, 1},
                                {1, 1, 1, 1, 1, 1, 1}};

    size_t vector_length = vector_lengths[channels - 1][depth];
    int offset_cols = (mat_dst.offset / mat_dst.elemSize()) & (vector_length - 1);
    int cols = divUp(mat_dst.cols + offset_cols, vector_length);

    size_t localThreads[3] = { 64, 4, 1 };
    size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0],
                                divUp(mat_dst.rows, localThreads[1]) * localThreads[1],
                                1 };

    int dst_step1 = mat_dst.cols * mat_dst.elemSize();
    vector<pair<size_t, const void *> > args;
    args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_dst.data));
    args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst.step));
    args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst.offset));
    args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src[0].data));
    args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[0].step));
    args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[0].offset));
    args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src[1].data));
    args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[1].step));
    args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[1].offset));
    if (channels == 4)
    {
        args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src[2].data));
        args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[2].step));
        args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[2].offset));

        // if channels == 3, the matrix has been converted to 4 channels,
        // so the third source plane is passed again as the fourth input
        //if(n == 3)
        //    args.push_back( make_pair( sizeof(cl_int), (void *)&offset_cols));
        if (n == 3)
        {
            args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src[2].data));
            args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[2].step));
            args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[2].offset));
        }
        else if (n == 4)
        {
            args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src[3].data));
            args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[3].step));
            args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[3].offset));
        }
    }
    args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst.rows));
    args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1));

    openCLExecuteKernel(clCxt, &merge_mat, kernelName, globalThreads, localThreads, args, channels, depth);
}
void processNet(std::string weights, std::string proto, std::string halide_scheduler,
                const Mat& input, const std::string& outputLayer, const std::string& framework)
{
    if (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL)
    {
#if defined(HAVE_OPENCL)
        if (!cv::ocl::useOpenCL())
#endif
        {
            throw cvtest::SkipTestException("OpenCL is not available/disabled in OpenCV");
        }
    }
    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL)
        throw SkipTestException("Skip OpenCL target of Inference Engine backend");

    randu(input, 0.0f, 1.0f);

    weights = findDataFile(weights, false);
    if (!proto.empty())
        proto = findDataFile(proto, false);
    if (backend == DNN_BACKEND_HALIDE)
    {
        if (halide_scheduler == "disabled")
            throw cvtest::SkipTestException("Halide test is disabled");
        if (!halide_scheduler.empty())
            halide_scheduler = findDataFile(std::string("dnn/halide_scheduler_") +
                                            (target == DNN_TARGET_OPENCL ? "opencl_" : "") +
                                            halide_scheduler, true);
    }
    if (framework == "caffe")
    {
        net = cv::dnn::readNetFromCaffe(proto, weights);
    }
    else if (framework == "torch")
    {
        net = cv::dnn::readNetFromTorch(weights);
    }
    else if (framework == "tensorflow")
    {
        net = cv::dnn::readNetFromTensorflow(weights, proto);
    }
    else
        CV_Error(Error::StsNotImplemented, "Unknown framework " + framework);

    net.setInput(blobFromImage(input, 1.0, Size(), Scalar(), false));
    net.setPreferableBackend(backend);
    net.setPreferableTarget(target);
    if (backend == DNN_BACKEND_HALIDE)
    {
        net.setHalideScheduler(halide_scheduler);
    }

    MatShape netInputShape = shape(1, 3, input.rows, input.cols);
    size_t weightsMemory = 0, blobsMemory = 0;
    net.getMemoryConsumption(netInputShape, weightsMemory, blobsMemory);
    int64 flops = net.getFLOPS(netInputShape);
    CV_Assert(flops > 0);

    net.forward(outputLayer); // warmup

    std::cout << "Memory consumption:" << std::endl;
    std::cout << "    Weights(parameters): " << divUp(weightsMemory, 1u << 20) << " Mb" << std::endl;
    std::cout << "    Blobs: " << divUp(blobsMemory, 1u << 20) << " Mb" << std::endl;
    std::cout << "Calculation complexity: " << flops * 1e-9 << " GFlops" << std::endl;

    PERF_SAMPLE_BEGIN()
        net.forward();
    PERF_SAMPLE_END()

    SANITY_CHECK_NOTHING();
}
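// --- illustration (not part of the test) ------------------------------------
// The memory report above rounds byte counts up to whole mebibytes via
// divUp(bytes, 1u << 20), so even a 1-byte blob prints as "1 Mb".
// Standalone sketch of the same rounding (bytesToMbRoundedUp is hypothetical):

static inline unsigned bytesToMbRoundedUp(size_t bytes)
{
    return (unsigned)((bytes + (1u << 20) - 1) / (1u << 20));
}

// e.g. bytesToMbRoundedUp(3251200) == 4: roughly 3.1 MiB of weights is
// reported as "4 Mb".
// -----------------------------------------------------------------------------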
static void split_vector_run(const oclMat &mat_src, oclMat *mat_dst)
{
    if (!mat_src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && mat_src.type() == CV_64F)
    {
        CV_Error(Error::GpuNotSupported, "Selected device doesn't support double\r\n");
        return;
    }

    Context *clCxt = mat_src.clCxt;
    int channels = mat_src.oclchannels();
    int depth = mat_src.depth();

    String kernelName = "split_vector";

    int vector_lengths[4][7] = {{0, 0, 0, 0, 0, 0, 0},
                                {4, 4, 2, 2, 1, 1, 1},
                                {4, 4, 2, 2, 1, 1, 1},
                                {4, 4, 2, 2, 1, 1, 1}};

    size_t vector_length = vector_lengths[channels - 1][mat_dst[0].depth()];

    int max_offset_cols = 0;
    for (int i = 0; i < channels; i++)
    {
        int offset_cols = (mat_dst[i].offset / mat_dst[i].elemSize()) & (vector_length - 1);
        if (max_offset_cols < offset_cols)
            max_offset_cols = offset_cols;
    }

    int cols = vector_length == 1 ? divUp(mat_src.cols, vector_length)
                                  : divUp(mat_src.cols + max_offset_cols, vector_length);

    size_t localThreads[3] = { 64, 4, 1 };
    size_t globalThreads[3] = { cols, mat_src.rows, 1 };

    int dst_step1 = mat_dst[0].cols * mat_dst[0].elemSize();
    std::vector<std::pair<size_t, const void *> > args;
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src.data));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src.step));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src.offset));
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[0].data));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[0].step));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[0].offset));
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[1].data));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[1].step));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[1].offset));
    if (channels >= 3)
    {
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[2].data));
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[2].step));
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[2].offset));
    }
    if (channels >= 4)
    {
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[3].data));
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[3].step));
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[3].offset));
    }
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src.rows));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1));

    openCLExecuteKernel(clCxt, &split_mat, kernelName, globalThreads, localThreads, args, channels, depth);
}