Example #1
////////////////////////////
// kernel caller definitions
bool SURF_OCL::calcLayerDetAndTrace(int octave, int c_layer_rows)
{
    int nOctaveLayers = params->nOctaveLayers;
    const int min_size = calcSize(octave, 0);
    const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
    const int max_samples_j = 1 + ((img_cols - min_size) >> octave);

    size_t localThreads[]  = {16, 16};
    size_t globalThreads[] =
    {
        divUp(max_samples_j, (int)localThreads[0]) * localThreads[0],
        divUp(max_samples_i, (int)localThreads[1]) * localThreads[1] * (nOctaveLayers + 2)
    };
    ocl::Kernel kerCalcDetTrace("SURF_calcLayerDetAndTrace", ocl::xfeatures2d::surf_oclsrc, kerOpts);
    if(haveImageSupport)
    {
        kerCalcDetTrace.args(sumTex,
                             img_rows, img_cols, nOctaveLayers,
                             octave, c_layer_rows,
                             ocl::KernelArg::WriteOnlyNoSize(det),
                             ocl::KernelArg::WriteOnlyNoSize(trace));
    }
    else
    {
        kerCalcDetTrace.args(ocl::KernelArg::ReadOnlyNoSize(sum),
                             img_rows, img_cols, nOctaveLayers,
                             octave, c_layer_rows,
                             ocl::KernelArg::WriteOnlyNoSize(det),
                             ocl::KernelArg::WriteOnlyNoSize(trace));
    }
    return kerCalcDetTrace.run(2, globalThreads, localThreads, true);
}
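Every example in this listing sizes GPU work with divUp, whose definition never appears here. A minimal sketch of the integer ceiling division it is assumed to perform (cv::divUp behaves this way for non-negative operands):

// Presumed helper (assumption, not shown in the source): divide and round up.
// divUp(10, 16) == 1, divUp(16, 16) == 1, divUp(17, 16) == 2
static inline int divUp(int a, int b)
{
    return (a + b - 1) / b;
}

Rounding the global work size up to a multiple of the local size, as in divUp(max_samples_j, localThreads[0]) * localThreads[0] above, keeps the NDRange divisible by the work-group dimensions while still covering every sample; the kernel is expected to bounds-check the surplus work-items.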
Example #2
static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
                                     int templateWindowSize, int searchWindowSize)
{
    int type = _src.type(), cn = CV_MAT_CN(type);
    Size size = _src.size();

    if ( type != CV_8UC1 && type != CV_8UC2 && type != CV_8UC4 )
        return false;

    int templateWindowHalfSize = templateWindowSize / 2;
    int searchWindowHalfSize = searchWindowSize / 2;
    templateWindowSize  = templateWindowHalfSize * 2 + 1;
    searchWindowSize = searchWindowHalfSize * 2 + 1;
    int nblocksx = divUp(size.width, BLOCK_COLS), nblocksy = divUp(size.height, BLOCK_ROWS);
    int almostTemplateWindowSizeSqBinShift = -1;

    char cvt[2][40];
    String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d"
                         " -D uchar_t=%s -D int_t=%s -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
                         " -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d"
                         " -D convert_int_t=%s -D cn=%d -D CTA_SIZE2=%d -D convert_uchar_t=%s",
                         templateWindowSize, searchWindowSize, ocl::typeToStr(type),
                         ocl::typeToStr(CV_32SC(cn)), BLOCK_COLS, BLOCK_ROWS, CTA_SIZE,
                         templateWindowHalfSize, searchWindowHalfSize,
                         ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), cn,
                         CTA_SIZE >> 1, ocl::convertTypeStr(CV_32S, CV_8U, cn, cvt[1]));

    ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts);
    if (k.empty())
        return false;

    UMat almostDist2Weight;
    if (!ocl_calcAlmostDist2Weight<float>(almostDist2Weight, searchWindowSize, templateWindowSize, h, cn,
                                   almostTemplateWindowSizeSqBinShift))
        return false;
    CV_Assert(almostTemplateWindowSizeSqBinShift >= 0);

    UMat srcex;
    int borderSize = searchWindowHalfSize + templateWindowHalfSize;
    copyMakeBorder(_src, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT);

    _dst.create(size, type);
    UMat dst = _dst.getUMat();

    int searchWindowSizeSq = searchWindowSize * searchWindowSize;
    Size upColSumSize(size.width, searchWindowSizeSq * nblocksy);
    Size colSumSize(nblocksx * templateWindowSize, searchWindowSizeSq * nblocksy);
    UMat buffer(upColSumSize + colSumSize, CV_32SC(cn));

    srcex = srcex(Rect(Point(borderSize, borderSize), size));
    k.args(ocl::KernelArg::ReadOnlyNoSize(srcex), ocl::KernelArg::WriteOnly(dst),
           ocl::KernelArg::PtrReadOnly(almostDist2Weight),
           ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift);

    size_t globalsize[2] = { nblocksx * CTA_SIZE, nblocksy }, localsize[2] = { CTA_SIZE, 1 };
    return k.run(2, globalsize, localsize, false);
}
Example #3
__host__ void gridPyrDown_(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
{
    const int rows = getRows(src);
    const int cols = getCols(src);

    dst.create(divUp(rows, 2), divUp(cols, 2));

    pyramids_detail::pyrDown<Brd>(shrinkPtr(src), shrinkPtr(dst), rows, cols, dst.rows, dst.cols, StreamAccessor::getStream(stream));
}
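Rounding up is what keeps odd source dimensions covered when halving. A worked check with small assumed sizes:

// Assumed 5x7 source: dst is divUp(5, 2) x divUp(7, 2) == 3 x 4,
// the same (n + 1) / 2 convention cv::pyrDown uses for its output size.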
Example #4
    __host__ void reduce(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        const dim3 block(Policy::block_size_x, Policy::block_size_y);
        const dim3 grid(divUp(cols, block.x * Policy::patch_size_x), divUp(rows, block.y * Policy::patch_size_y));

        reduce<Reductor, Policy::block_size_x * Policy::block_size_y, Policy::patch_size_x, Policy::patch_size_y><<<grid, block, 0, stream>>>(src, result, mask, rows, cols);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );

        if (stream == 0)
            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }
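Each block here covers block_size * patch_size elements per axis, so the grid divides the image extent by that combined footprint. A sketch with hypothetical policy values (not taken from the source):

// Assumed: 1920x1080 image, 32x8 thread blocks, a 4x4 patch per thread.
const dim3 block(32, 8);
const dim3 grid(divUp(1920, 32 * 4),  // divUp(1920, 128) == 15 blocks across
                divUp(1080, 8 * 4));  // divUp(1080, 32)  == 34 blocks down (rounds up)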
Example #5
            static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
            {
                typedef TransformFunctorTraits<BinOp> ft;

                const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
                const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1);

                transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
                cudaSafeCall( cudaGetLastError() );

                if (stream == 0)
                    cudaSafeCall( cudaDeviceSynchronize() );
            }
Example #6
            static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
            {
                dim3 threads(16, 16, 1);
                dim3 grid(1, 1, 1);

                grid.x = divUp(src1.cols, threads.x);
                grid.y = divUp(src1.rows, threads.y);        

                transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
                cudaSafeCall( cudaGetLastError() );

                if (stream == 0)
                    cudaSafeCall( cudaDeviceSynchronize() );            
            }
Example #7
        // provide additional methods for the user to interact with the command queue after a task is fired
        static void openCLExecuteKernel_2(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3],
                                   size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels,
                                   int depth, char *build_options, FLUSH_MODE finish_mode)
        {
            //construct kernel name
            //The rule is functionName_Cn_Dn, where C represents channels, D represents data-type depth, and n is an integer
            //for example split_C2_D2 is the split kernel with channels == 2 and data-type depth == 2 (data type is char)
            std::stringstream idxStr;
            if(channels != -1)
                idxStr << "_C" << channels;
            if(depth != -1)
                idxStr << "_D" << depth;
            kernelName += idxStr.str();

            cl_kernel kernel;
            kernel = openCLGetKernelFromSource(clCxt, source, kernelName, build_options);

            if ( localThreads != NULL)
            {
                globalThreads[0] = divUp(globalThreads[0], localThreads[0]) * localThreads[0];
                globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1];
                globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2];

                //size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2];
                cv::ocl::openCLVerifyKernel(clCxt, kernel,  localThreads);
            }
            for(size_t i = 0; i < args.size(); i ++)
                openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));

            openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
                                                  localThreads, 0, NULL, NULL));

            switch(finish_mode)
            {
            case CLFINISH:
                clFinish(clCxt->impl->clCmdQueue);
                // note: falls through, so CLFINISH also issues the clFlush below (a no-op after clFinish)
            case CLFLUSH:
                clFlush(clCxt->impl->clCmdQueue);
                break;
            case DISABLE:
            default:
                break;
            }
            openCLSafeCall(clReleaseKernel(kernel));
        }
Example #8
            ////////////////////////////////////////////////////////////////////////////////////////////////////
            //////////////////////////////////////split/////////////////////////////////////////////////////////////
            //////////////////////////////////////////////////////////////////////////////////////////////////
            void split_vector_run_no_roi(const oclMat &mat_src, oclMat *mat_dst)
            {
                Context  *clCxt = mat_src.clCxt;
                int channels = mat_src.channels();
                int depth = mat_src.depth();

                string kernelName = "split_vector";

                int indexes[4][7] = {{0, 0, 0, 0, 0, 0, 0},
                    {8, 8, 8, 8, 4, 4, 2},
                    {8, 8, 8, 8, 4, 4, 4},
                    {4, 4, 2, 2, 1, 1, 1}
                };

                size_t index = indexes[channels-1][mat_dst[0].depth()];
                int cols = divUp(mat_src.cols, index);
                size_t localThreads[3]  = { 64, 4, 1 };
                size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0],
                    divUp(mat_src.rows, localThreads[1]) * localThreads[1],
                    1
                };

                vector<pair<size_t , const void *> > args;
                args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src.data));
                args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src.step));
                args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src.rows));
                args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
                args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_dst[0].data));
                args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst[0].step));
                args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_dst[1].data));
                args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst[1].step));
                if(channels >= 3)
                {
                    args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_dst[2].data));
                    args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst[2].step));
                }
                if(channels >= 4)
                {
                    args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_dst[3].data));
                    args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst[3].step));
                }

                openCLExecuteKernel(clCxt, &split_mat, kernelName, globalThreads, localThreads, args, channels, depth);
            }
Example #9
size_t SampleConverterBase::targetSize(size_t sourceSize) {
    // we round up on conversion
    size_t numSamples = divUp(sourceSize, (size_t)mSourceSampleSize);
    if (numSamples > SIZE_MAX / mTargetSampleSize) {
        ALOGW("limiting target size due to overflow (%zu*%zu/%zu)",
                sourceSize, mTargetSampleSize, mSourceSampleSize);
        return SIZE_MAX;
    }
    return numSamples * mTargetSampleSize;
}
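The guard matters because numSamples * mTargetSampleSize can wrap a size_t. A worked pass with assumed sample sizes (16-bit PCM to 32-bit float, so mSourceSampleSize == 2 and mTargetSampleSize == 4):

size_t sourceSize = 1001;                          // odd byte count
size_t numSamples = divUp(sourceSize, (size_t)2);  // 501 samples, rounded up
// 501 <= SIZE_MAX / 4, so the multiply is safe: 501 * 4 == 2004 bytes returned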
Example #10
    __host__ void histogram(const SrcPtr& src, ResType* hist, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        const dim3 block(Policy::block_size_x, Policy::block_size_y);
        const dim3 grid(divUp(rows, block.y));

        const int BLOCK_SIZE = Policy::block_size_x * Policy::block_size_y;

        histogram<BIN_COUNT, BLOCK_SIZE><<<grid, block, 0, stream>>>(src, hist, mask, rows, cols);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );

        if (stream == 0)
            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }
Example #11
    __host__ void reduceToRow(const SrcPtr& src, ResType* dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        const int BLOCK_SIZE_X = 16;
        const int BLOCK_SIZE_Y = 16;

        const dim3 block(BLOCK_SIZE_X, BLOCK_SIZE_Y);
        const dim3 grid(divUp(cols, block.x));

        reduceToRow<Reductor, BLOCK_SIZE_X, BLOCK_SIZE_Y><<<grid, block, 0, stream>>>(src, dst, mask, rows, cols);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );

        if (stream == 0)
            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }
Example #12
bool SURF_OCL::findMaximaInLayer(int counterOffset, int octave,
                                 int layer_rows, int layer_cols)
{
    const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;
    int nOctaveLayers = params->nOctaveLayers;

    size_t localThreads[3]  = {16, 16};
    size_t globalThreads[3] =
    {
        divUp(layer_cols - 2 * min_margin, (int)localThreads[0] - 2) * localThreads[0],
        divUp(layer_rows - 2 * min_margin, (int)localThreads[1] - 2) * nOctaveLayers * localThreads[1]
    };

    ocl::Kernel kerFindMaxima("SURF_findMaximaInLayer", ocl::xfeatures2d::surf_oclsrc, kerOpts);
    return kerFindMaxima.args(ocl::KernelArg::ReadOnlyNoSize(det),
                              ocl::KernelArg::ReadOnlyNoSize(trace),
                              ocl::KernelArg::PtrReadWrite(maxPosBuffer),
                              ocl::KernelArg::PtrReadWrite(counters),
                              counterOffset, img_rows, img_cols,
                              octave, nOctaveLayers,
                              layer_rows, layer_cols,
                              maxCandidates,
                              (float)params->hessianThreshold).run(2, globalThreads, localThreads, true);
}
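Note the subtraction inside divUp: with a one-pixel halo on each side of a 16-wide work-group, only 14 of its 16 lanes emit results, so groups are counted against the 14 useful lanes. Worked numbers under an assumed 300-column useful area:

// useful_cols = layer_cols - 2 * min_margin == 300 (assumed)
// groups in x = divUp(300, 16 - 2) == 22
// global in x = 22 * 16 == 352 work-items; adjacent groups overlap by the halo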
Example #13
            static void merge_vector_run(const oclMat *mat_src, size_t n, oclMat &mat_dst)
            {
                if(!mat_dst.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && mat_dst.type() == CV_64F)
                {
                    CV_Error(Error::GpuNotSupported, "Selected device doesn't support double\r\n");
                    return;
                }

                Context  *clCxt = mat_dst.clCxt;
                int channels = mat_dst.oclchannels();
                int depth = mat_dst.depth();

                String kernelName = "merge_vector";

                int vector_lengths[4][7] = {{0, 0, 0, 0, 0, 0, 0},
                    {2, 2, 1, 1, 1, 1, 1},
                    {4, 4, 2, 2, 1, 1, 1},
                    {1, 1, 1, 1, 1, 1, 1}
                };

                size_t vector_length = vector_lengths[channels - 1][depth];
                int offset_cols = (mat_dst.offset / mat_dst.elemSize()) & (vector_length - 1);
                int cols = divUp(mat_dst.cols + offset_cols, vector_length);

                size_t localThreads[3]  = { 64, 4, 1 };
                size_t globalThreads[3] = { cols, mat_dst.rows, 1 };

                int dst_step1 = mat_dst.cols * mat_dst.elemSize();
                std::vector<std::pair<size_t , const void *> > args;
                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst.data));
                args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst.step));
                args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst.offset));
                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src[0].data));
                args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[0].step));
                args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[0].offset));
                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src[1].data));
                args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[1].step));
                args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[1].offset));

                if(channels == 4)
                {
                    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src[2].data));
                    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[2].step));
                    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[2].offset));

                    if(n == 3)
                    {
                        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src[2].data));
                        args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[2].step));
                        args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[2].offset));
                    }
                    else if( n == 4)
                    {
                        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src[3].data));
                        args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[3].step));
                        args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[3].offset));
                    }
                }

                args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst.rows));
                args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols));
                args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1));

                openCLExecuteKernel(clCxt, &merge_mat, kernelName, globalThreads, localThreads, args, channels, depth);
            }
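The offset arithmetic relies on vector_length being a power of two, which turns the bitmask into a cheap modulo. A sketch with assumed values:

// For power-of-two vector_length: x & (vector_length - 1) == x % vector_length.
// Assumed: elemSize() == 4 bytes, offset == 24 bytes, vector_length == 4, 100 cols.
int offset_cols = (24 / 4) & (4 - 1);           // 6 & 3 == 2 misaligned columns
int cols        = divUp(100 + offset_cols, 4);  // 26 vector columns, widened to stay aligned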
Example #14
void ShapeApp::setup() {

	// setup kinect
//=============================================================================
	kinect.setRegistration(true);
	kinect.init();
	if (kinect.open()) {
		kinect_on = true;
	} else {
		kinect_on = false;
	}

	while (!kinect.isConnected()); // busy-wait until the Kinect reports connected (spins forever if open() failed)

	ofSetFrameRate(30);
	TIME_SAMPLE_SET_FRAMERATE(30.0f);

	// setup UI
//=============================================================================
	setupUI();

	// setup tracking data objects
//=============================================================================
	// CPU
	curr_f.init(DEPTH_X_RES, DEPTH_Y_RES);
	curr_f.allocateHost();

	new_f.init(DEPTH_X_RES, DEPTH_Y_RES);
	new_f.allocateHost();

	est_f.init(DEPTH_X_RES, DEPTH_Y_RES);
	est_f.allocateHost();

	view_f.init(DEPTH_X_RES, DEPTH_Y_RES);
	view_f.allocateHost();

	image.allocate(DEPTH_X_RES, DEPTH_Y_RES, OF_IMAGE_COLOR);
	view_image.allocate(DEPTH_X_RES, DEPTH_Y_RES, OF_IMAGE_COLOR);
	est_image.allocate(DEPTH_X_RES, DEPTH_Y_RES, OF_IMAGE_COLOR);

	// GPU
	curr_f.allocateDevice();
	new_f.allocateDevice();
	est_f.allocateDevice();
	view_f.allocateDevice();

	// data for ICP
//=============================================================================
	// GPU
	corresp.blocks_x =
			divUp(DEPTH_X_RES, CORRESPONDENCE_BLOCK_X);
	corresp.blocks_y =
			divUp(DEPTH_Y_RES, CORRESPONDENCE_BLOCK_Y);
	corresp.blocks_n = corresp.blocks_x * corresp.blocks_y;
	corresp.AtA_dev_size = AtA_SIZE * corresp.blocks_n;
	corresp.Atb_dev_size = Atb_SIZE * corresp.blocks_n;

	cudaMalloc((void **) &corresp.AtA_dev,
			corresp.AtA_dev_size * sizeof(float));
	cudaMalloc((void **) &corresp.Atb_dev,
			corresp.Atb_dev_size * sizeof(float));

	cudaMalloc((void **) &corresp.points_dev, curr_f.host.points_bn);

	// CPU
	corresp.AtA_host = (float *)malloc(corresp.AtA_dev_size * sizeof(float));
	corresp.Atb_host = (float *)malloc(corresp.Atb_dev_size * sizeof(float));

	corresp.AtA_sum = (float *)malloc(AtA_SIZE * sizeof(float));
	corresp.Atb_sum = (float *)malloc(Atb_SIZE * sizeof(float));

	correspondence_host = (float *) malloc(curr_f.host.points_bn);
	correspondence_dev = (float *) malloc(curr_f.host.points_bn); // note: plain malloc, so this is host memory despite the _dev suffix
	corresp.points_host = (float *)malloc(curr_f.host.points_bn);

	// voxel data
//=============================================================================
	// CPU
	min.set(-0.5, -0.5, -1.5);
	max.set(0.5, 0.5, -0.5);

	voxels.min = min;
	voxels.side_n = 256;
	voxels.size = (max - min) / (float)voxels.side_n;

	voxels.array_size = voxels.side_n * voxels.side_n * voxels.side_n;
	voxels.bytes_n = sizeof(float) * voxels.array_size;
	voxels.data = (float *)malloc(voxels.bytes_n);

	voxels.w_bytes_n = sizeof(unsigned char) * voxels.array_size;
	voxels.w_data = (unsigned char *)malloc(voxels.w_bytes_n);

	// GPU
	cudaMalloc((void **) &camera_opt.t, sizeof(float) * 16);
	cudaMalloc((void **) &camera_opt.it, sizeof(float) * 16);
	camera_opt.ref_pix_size = kinect.getRefPixelSize();
	camera_opt.ref_distance = kinect.getRefDistance();
	setFloat3(camera_opt.min, min);
	setFloat3(camera_opt.max, max);

	cudaMalloc((void **) &dev_voxels.data, voxels.bytes_n);
	cudaMalloc((void **) &dev_voxels.w_data, voxels.w_bytes_n);
	setFloat3(&dev_voxels.min, voxels.min);
	setFloat3(&dev_voxels.size, voxels.size);

	dev_voxels.side_n = voxels.side_n;
	dev_voxels.side_n2 = dev_voxels.side_n * dev_voxels.side_n;
	dev_voxels.array_size = voxels.array_size;
	dev_voxels.bytes_n = voxels.bytes_n;
	dev_voxels.w_bytes_n = voxels.w_bytes_n;

	resetVoxels();
}
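The per-block reduction buffers above are sized with divUp so a partial block at the image edge still gets storage. Illustrative sizing, assuming a 640x480 depth map and CORRESPONDENCE_BLOCK_X == CORRESPONDENCE_BLOCK_Y == 16:

// blocks_x = divUp(640, 16) == 40, blocks_y = divUp(480, 16) == 30
// blocks_n = 1200, so AtA_dev holds 1200 partial accumulators of AtA_SIZE
// floats each, which the host later collapses into AtA_sum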
Example #15
            void merge_vector_run(const oclMat *mat_src, size_t n, oclMat &mat_dst)
            {
                if(mat_dst.clCxt->impl->double_support == 0 && mat_dst.type() == CV_64F)
                {
                    CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
                    return;
                }

                Context  *clCxt = mat_dst.clCxt;
                int channels = mat_dst.channels();
                int depth = mat_dst.depth();

                string kernelName = "merge_vector";

                int vector_lengths[4][7] = {{0, 0, 0, 0, 0, 0, 0},
                    {2, 2, 1, 1, 1, 1, 1},
                    {4, 4, 2, 2, 1, 1, 1},
                    {1, 1, 1, 1, 1, 1, 1}
                };

                size_t vector_length = vector_lengths[channels-1][depth];
                int offset_cols = (mat_dst.offset / mat_dst.elemSize()) & (vector_length - 1);
                int cols = divUp(mat_dst.cols + offset_cols, vector_length);

                size_t localThreads[3]  = { 64, 4, 1 };
                size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0],
                    divUp(mat_dst.rows, localThreads[1]) * localThreads[1],
                    1
                };

                int dst_step1 = mat_dst.cols * mat_dst.elemSize();
                vector<pair<size_t , const void *> > args;
                args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_dst.data));
                args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst.step));
                args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst.offset));
                args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src[0].data));
                args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[0].step));
                args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[0].offset));
                args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src[1].data));
                args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[1].step));
                args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[1].offset));

                if(channels == 4)
                {
                    args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src[2].data));
                    args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[2].step));
                    args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[2].offset));

                    // if the source has 3 channels, the matrix is treated as 4-channel,
                    // so the third plane's arguments are pushed a second time below
                    //if(n == 3)
                     //   args.push_back( make_pair( sizeof(cl_int), (void *)&offset_cols));

                    if(n == 3)
                    {
                        args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src[2].data));
                        args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[2].step));
                        args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[2].offset));
                    }
                    else if( n == 4)
                    {
                        args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src[3].data));
                        args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[3].step));
                        args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[3].offset));
                    }
                }

                args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst.rows));
                args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
                args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1));

                openCLExecuteKernel(clCxt, &merge_mat, kernelName, globalThreads, localThreads, args, channels, depth);
            }
Example #16
    void processNet(std::string weights, std::string proto, std::string halide_scheduler,
                    const Mat& input, const std::string& outputLayer,
                    const std::string& framework)
    {
        if (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL)
        {
#if defined(HAVE_OPENCL)
            if (!cv::ocl::useOpenCL())
#endif
            {
                throw cvtest::SkipTestException("OpenCL is not available/disabled in OpenCV");
            }
        }
        if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL)
            throw cvtest::SkipTestException("Skip OpenCL target of Inference Engine backend");

        randu(input, 0.0f, 1.0f);

        weights = findDataFile(weights, false);
        if (!proto.empty())
            proto = findDataFile(proto, false);
        if (backend == DNN_BACKEND_HALIDE)
        {
            if (halide_scheduler == "disabled")
                throw cvtest::SkipTestException("Halide test is disabled");
            if (!halide_scheduler.empty())
                halide_scheduler = findDataFile(std::string("dnn/halide_scheduler_") + (target == DNN_TARGET_OPENCL ? "opencl_" : "") + halide_scheduler, true);
        }
        if (framework == "caffe")
        {
            net = cv::dnn::readNetFromCaffe(proto, weights);
        }
        else if (framework == "torch")
        {
            net = cv::dnn::readNetFromTorch(weights);
        }
        else if (framework == "tensorflow")
        {
            net = cv::dnn::readNetFromTensorflow(weights, proto);
        }
        else
            CV_Error(Error::StsNotImplemented, "Unknown framework " + framework);

        net.setInput(blobFromImage(input, 1.0, Size(), Scalar(), false));
        net.setPreferableBackend(backend);
        net.setPreferableTarget(target);
        if (backend == DNN_BACKEND_HALIDE)
        {
            net.setHalideScheduler(halide_scheduler);
        }

        MatShape netInputShape = shape(1, 3, input.rows, input.cols);
        size_t weightsMemory = 0, blobsMemory = 0;
        net.getMemoryConsumption(netInputShape, weightsMemory, blobsMemory);
        int64 flops = net.getFLOPS(netInputShape);
        CV_Assert(flops > 0);

        net.forward(outputLayer); // warmup

        std::cout << "Memory consumption:" << std::endl;
        std::cout << "    Weights(parameters): " << divUp(weightsMemory, 1u<<20) << " Mb" << std::endl;
        std::cout << "    Blobs: " << divUp(blobsMemory, 1u<<20) << " Mb" << std::endl;
        std::cout << "Calculation complexity: " << flops * 1e-9 << " GFlops" << std::endl;

        PERF_SAMPLE_BEGIN()
            net.forward();
        PERF_SAMPLE_END()

        SANITY_CHECK_NOTHING();
    }
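divUp doubles here as a bytes-to-megabytes conversion that rounds up, so any non-empty model reports at least 1 MB:

// divUp(weightsMemory, 1u << 20) == ceil(weightsMemory / 2^20)
// e.g. divUp(1, 1u << 20) == 1 and divUp(3u << 20, 1u << 20) == 3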
Example #17
            static void split_vector_run(const oclMat &mat_src, oclMat *mat_dst)
            {

                if(!mat_src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && mat_src.type() == CV_64F)
                {
                    CV_Error(Error::GpuNotSupported, "Selected device doesn't support double\r\n");
                    return;
                }

                Context  *clCxt = mat_src.clCxt;
                int channels = mat_src.oclchannels();
                int depth = mat_src.depth();

                String kernelName = "split_vector";

                int vector_lengths[4][7] = {{0, 0, 0, 0, 0, 0, 0},
                    {4, 4, 2, 2, 1, 1, 1},
                    {4, 4, 2, 2, 1, 1, 1},
                    {4, 4, 2, 2, 1, 1, 1}
                };

                size_t vector_length = vector_lengths[channels - 1][mat_dst[0].depth()];

                int max_offset_cols = 0;
                for(int i = 0; i < channels; i++)
                {
                    int offset_cols = (mat_dst[i].offset / mat_dst[i].elemSize()) & (vector_length - 1);
                    if(max_offset_cols < offset_cols)
                        max_offset_cols = offset_cols;
                }

                int cols =  vector_length == 1 ? divUp(mat_src.cols, vector_length)
                            : divUp(mat_src.cols + max_offset_cols, vector_length);

                size_t localThreads[3]  = { 64, 4, 1 };
                size_t globalThreads[3] = { cols, mat_src.rows, 1 };

                int dst_step1 = mat_dst[0].cols * mat_dst[0].elemSize();
                std::vector<std::pair<size_t , const void *> > args;
                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src.data));
                args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src.step));
                args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src.offset));
                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[0].data));
                args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[0].step));
                args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[0].offset));
                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[1].data));
                args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[1].step));
                args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[1].offset));
                if(channels >= 3)
                {

                    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[2].data));
                    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[2].step));
                    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[2].offset));
                }
                if(channels >= 4)
                {
                    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[3].data));
                    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[3].step));
                    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[3].offset));
                }

                args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src.rows));
                args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols));
                args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1));

                openCLExecuteKernel(clCxt, &split_mat, kernelName, globalThreads, localThreads, args, channels, depth);
            }