// NOTE(review): this forward_ocl definition is never closed before the next
// forward_ocl definition begins; it looks like a truncated earlier revision of
// the same method -- confirm, then either complete it or remove it.
//
// What the visible part does:
//   * fetches the input/output UMat vectors and reads the layer size from
//     inputs[0] and the image size from inputs[1] (dimensions 2 and 3);
//   * picks stepX/stepY: the image/layer size ratio when either _stepX or
//     _stepY is 0, otherwise the stored members;
//   * when umat_offsetsX is empty, copies _offsetsX, _offsetsY, _aspectRatios
//     and _variance into their UMat counterparts and creates umat_scales as a
//     one-dimensional CV_32F UMat of `_numPriors >> (_offsetsX.size() - 1)`
//     elements initialised with 1.0f.
//
// NOTE(review): the offsetsY Mat is sized with _offsetsX.size() rather than
// _offsetsY.size() -- verify the two vectors always have the same length.
bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals) { std::vector<UMat> inputs; std::vector<UMat> outputs; inps.getUMatVector(inputs); outs.getUMatVector(outputs); int _layerWidth = inputs[0].size[3]; int _layerHeight = inputs[0].size[2]; int _imageWidth = inputs[1].size[3]; int _imageHeight = inputs[1].size[2]; float stepX, stepY; if (_stepX == 0 || _stepY == 0) { stepX = static_cast<float>(_imageWidth) / _layerWidth; stepY = static_cast<float>(_imageHeight) / _layerHeight; } else { stepX = _stepX; stepY = _stepY; } if (umat_offsetsX.empty()) { Mat offsetsX(1, _offsetsX.size(), CV_32FC1, &_offsetsX[0]); Mat offsetsY(1, _offsetsX.size(), CV_32FC1, &_offsetsY[0]); Mat aspectRatios(1, _aspectRatios.size(), CV_32FC1, &_aspectRatios[0]); Mat variance(1, _variance.size(), CV_32FC1, &_variance[0]); offsetsX.copyTo(umat_offsetsX); offsetsY.copyTo(umat_offsetsY); aspectRatios.copyTo(umat_aspectRatios); variance.copyTo(umat_variance); int real_numPriors = _numPriors >> (_offsetsX.size() - 1); umat_scales = UMat(1, &real_numPriors, CV_32F, 1.0f); }
/**
 * OpenCL forward pass: fills outputs[0] with prior boxes and their variances.
 *
 * Steps:
 *   1. Lazily upload the host-side offset / variance / box-size tables to
 *      device memory (only while umat_offsetsX is still empty).
 *   2. Run the "prior_box" kernel, one work item per feature-map cell.
 *   3. If _clip is set, clamp the box coordinates to [0, 1] on the host.
 *   4. Run the "set_variance" kernel, which writes starting at
 *      total(shape(outputs[0]), 2) elements into the output.
 *
 * @param inps      inputs; [0] supplies the layer size and [1] the image size
 *                  (both read from dimensions 2 and 3).
 * @param outs      outputs; only outputs[0] is written.
 * @param internals unused.
 * @return true on success; false when the OpenCL path cannot produce a correct
 *         result (too few inputs/outputs, a kernel failed to run, or clipping
 *         was requested for half-precision data).
 */
bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{
    const bool use_half = (inps.depth() == CV_16S);

    // The clipping step below works on the mapped buffer as float; running it
    // over half-precision data would corrupt the output, so report failure
    // before doing any work.
    // NOTE(review): assumes the caller falls back to another implementation
    // when this returns false -- confirm against the call site.
    if (_clip && use_half)
        return false;

    std::vector<UMat> inputs;
    std::vector<UMat> outputs;
    inps.getUMatVector(inputs);
    outs.getUMatVector(outputs);
    if (inputs.size() < 2 || outputs.empty())
        return false;

    const int layerWidth = inputs[0].size[3];
    const int layerHeight = inputs[0].size[2];
    const int imageWidth = inputs[1].size[3];
    const int imageHeight = inputs[1].size[2];

    // A zero step would collapse every prior centre onto the origin, so fall
    // back to the image/layer size ratio when no explicit step is configured.
    float stepX = _stepX;
    float stepY = _stepY;
    if (_stepX == 0 || _stepY == 0)
    {
        stepX = static_cast<float>(imageWidth) / layerWidth;
        stepY = static_cast<float>(imageHeight) / layerHeight;
    }

    // One-time upload of the constant tables.
    if (umat_offsetsX.empty())
    {
        Mat offsetsX(1, _offsetsX.size(), CV_32FC1, &_offsetsX[0]);
        Mat offsetsY(1, _offsetsY.size(), CV_32FC1, &_offsetsY[0]);
        Mat variance(1, _variance.size(), CV_32FC1, &_variance[0]);
        Mat widths(1, _boxWidths.size(), CV_32FC1, &_boxWidths[0]);
        Mat heights(1, _boxHeights.size(), CV_32FC1, &_boxHeights[0]);

        offsetsX.copyTo(umat_offsetsX);
        offsetsY.copyTo(umat_offsetsY);
        variance.copyTo(umat_variance);
        widths.copyTo(umat_widths);
        heights.copyTo(umat_heights);
    }

    const String opts = use_half
        ? "-DDtype=half -DDtype4=half4 -Dconvert_T=convert_half4"
        : "-DDtype=float -DDtype4=float4 -Dconvert_T=convert_float4";

    // Generate the boxes: one work item per feature-map cell.
    {
        size_t cellCount = static_cast<size_t>(layerHeight) * layerWidth;
        ocl::Kernel boxKernel("prior_box", ocl::dnn::prior_box_oclsrc, opts);
        boxKernel.set(0, static_cast<int>(cellCount));
        boxKernel.set(1, stepX);
        boxKernel.set(2, stepY);
        boxKernel.set(3, ocl::KernelArg::PtrReadOnly(umat_offsetsX));
        boxKernel.set(4, ocl::KernelArg::PtrReadOnly(umat_offsetsY));
        boxKernel.set(5, static_cast<int>(_offsetsX.size()));
        boxKernel.set(6, ocl::KernelArg::PtrReadOnly(umat_widths));
        boxKernel.set(7, ocl::KernelArg::PtrReadOnly(umat_heights));
        boxKernel.set(8, static_cast<int>(_boxWidths.size()));
        boxKernel.set(9, ocl::KernelArg::PtrWriteOnly(outputs[0]));
        boxKernel.set(10, layerHeight);
        boxKernel.set(11, layerWidth);
        boxKernel.set(12, imageHeight);
        boxKernel.set(13, imageWidth);
        if (!boxKernel.run(1, &cellCount, NULL, false))
            return false;
    }

    // Clip the priors' coordinates so that they lie within [0, 1].
    if (_clip)
    {
        // The mapping must be writable, and clamping starts at the first
        // element of the output. The Mat lives only in this scope so the
        // mapping is released before the next kernel touches outputs[0].
        Mat mapped = outputs[0].getMat(ACCESS_RW);
        float* coords = mapped.ptr<float>();
        const size_t coordCount =
            static_cast<size_t>(layerHeight) * layerWidth * _numPriors * 4;
        for (size_t i = 0; i < coordCount; ++i)
        {
            coords[i] = std::min<float>(std::max<float>(coords[i], 0.f), 1.f);
        }
    }

    // Set the variance.
    {
        ocl::Kernel varKernel("set_variance", ocl::dnn::prior_box_oclsrc, opts);
        const int varOffset = total(shape(outputs[0]), 2);
        size_t priorCount = static_cast<size_t>(layerHeight) * layerWidth * _numPriors;
        varKernel.set(0, static_cast<int>(priorCount));
        varKernel.set(1, varOffset);
        varKernel.set(2, static_cast<int>(_variance.size()));
        varKernel.set(3, ocl::KernelArg::PtrReadOnly(umat_variance));
        varKernel.set(4, ocl::KernelArg::PtrWriteOnly(outputs[0]));
        if (!varKernel.run(1, &priorCount, NULL, false))
            return false;
    }

    return true;
}