ExecutionStatus execute(NodeSocketReader& reader, NodeSocketWriter& writer) override
{
    const clw::Image2D& input = reader.readSocket(0).getDeviceImageMono();
    clw::Image2D& output = writer.acquireSocket(0).getDeviceImageMono();

    int imageWidth = input.width();
    int imageHeight = input.height();
    if(imageWidth == 0 || imageHeight == 0)
        return ExecutionStatus(EStatus::Ok);

    // Pick the kernel matching the requested Bayer pattern
    clw::Kernel kernelConvertBayer2Gray;
    switch(_BayerCode.cast<Enum>().cast<cvu::EBayerCode>())
    {
    case cvu::EBayerCode::RG:
        kernelConvertBayer2Gray = _gpuComputeModule->acquireKernel(_kidConvertRG2Gray);
        break;
    case cvu::EBayerCode::GB:
        kernelConvertBayer2Gray = _gpuComputeModule->acquireKernel(_kidConvertGB2Gray);
        break;
    case cvu::EBayerCode::GR:
        kernelConvertBayer2Gray = _gpuComputeModule->acquireKernel(_kidConvertGR2Gray);
        break;
    case cvu::EBayerCode::BG:
        kernelConvertBayer2Gray = _gpuComputeModule->acquireKernel(_kidConvertBG2Gray);
        break;
    }

    if(kernelConvertBayer2Gray.isNull())
        return ExecutionStatus(EStatus::Error, "Bad bayer code");

    // Ensure the output image is large enough
    if(output.isNull() || output.width() != imageWidth || output.height() != imageHeight)
    {
        output = _gpuComputeModule->context().createImage2D(
            clw::EAccess::ReadWrite, clw::EMemoryLocation::Device,
            clw::ImageFormat(clw::EChannelOrder::R, clw::EChannelType::Normalized_UInt8),
            imageWidth, imageHeight);
    }

    cl_float3 gains = { (float) _redGain, (float) _greenGain, (float) _blueGain };
    // 16x16 work-group plus a one-pixel apron on each side for the Bayer neighbourhood
    int sharedWidth = 16 + 2;
    int sharedHeight = 16 + 2;

    kernelConvertBayer2Gray.setLocalWorkSize(16, 16);
    kernelConvertBayer2Gray.setRoundedGlobalWorkSize(imageWidth, imageHeight);
    kernelConvertBayer2Gray.setArg(0, input);
    kernelConvertBayer2Gray.setArg(1, output);
    kernelConvertBayer2Gray.setArg(2, gains);
    kernelConvertBayer2Gray.setArg(3, clw::LocalMemorySize(sharedWidth*sharedHeight*sizeof(float)));
    kernelConvertBayer2Gray.setArg(4, sharedWidth);
    kernelConvertBayer2Gray.setArg(5, sharedHeight);

    _gpuComputeModule->queue().asyncRunKernel(kernelConvertBayer2Gray);
    _gpuComputeModule->queue().finish();

    return ExecutionStatus(EStatus::Ok);
}
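For orientation, the six setArg calls above imply a device-side interface along the following lines. This is only a hypothetical sketch (the kernel name and parameter names are illustrative, not taken from the actual .cl source): one image per socket, the per-channel gains as a float3, and a (16+2)x(16+2) local-memory tile whose dimensions arrive as the last two arguments.

// Hypothetical OpenCL C signature matching the host-side setArg calls above (illustrative only)
__kernel void convertBayer2Gray(__read_only image2d_t src,   // arg 0: Bayer-patterned input
                                __write_only image2d_t dst,  // arg 1: grayscale output
                                float3 gains,                // arg 2: R/G/B gains
                                __local float* sharedTile,   // arg 3: (16+2)x(16+2) float scratch tile
                                int sharedWidth,             // arg 4: tile width
                                int sharedHeight)            // arg 5: tile height
{
    // Body omitted - the real RG/GB/GR/BG kernel variants perform the pattern-specific
    // demosaicing to gray using the cached tile and the per-channel gains.
}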
ExecutionStatus execute(NodeSocketReader& reader, NodeSocketWriter& writer) override
{
    // Read input sockets
    const cv::Mat& src = reader.readSocket(0).getImageMono();
    // Acquire output sockets
    KeyPoints& kp = writer.acquireSocket(0).getKeypoints();
    cv::Mat& descriptors = writer.acquireSocket(1).getArray();
    // Validate inputs
    if(src.empty())
        return ExecutionStatus(EStatus::Ok);

    // Detect keypoints and compute their descriptors in one pass
    (*_brisk)(src, cv::noArray(), kp.kpoints, descriptors);
    kp.image = src;

    return ExecutionStatus(EStatus::Ok,
        string_format("Keypoints detected: %d", (int) kp.kpoints.size()));
}
ExecutionStatus execute(NodeSocketReader& reader, NodeSocketWriter& writer) override
{
    // Read input sockets
    const KeyPoints& kp = reader.readSocket(0).getKeypoints();
    // Validate inputs
    if(kp.kpoints.empty() || kp.image.empty())
        return ExecutionStatus(EStatus::Ok);

    // Acquire output sockets
    KeyPoints& outKp = writer.acquireSocket(0).getKeypoints();
    cv::Mat& outDescriptors = writer.acquireSocket(1).getArray();

    outKp = kp;
    _brisk->compute(kp.image, outKp.kpoints, outDescriptors);

    return ExecutionStatus(EStatus::Ok);
}
ExecutionStatus execute(NodeSocketReader& reader, NodeSocketWriter& writer) override
{
    // Read input sockets
    const cv::Mat& src = reader.readSocket(0).getImageMono();
    // Acquire output sockets
    KeyPoints& kp = writer.acquireSocket(0).getKeypoints();
    cv::Mat& descriptors = writer.acquireSocket(1).getArray();
    // Validate inputs
    if(src.empty())
        return ExecutionStatus(EStatus::Ok);

    // Detect keypoints and compute SIFT descriptors
    cv::SIFT sift(_nFeatures, _nOctaveLayers, _contrastThreshold,
        _edgeThreshold, _sigma);
    sift(src, cv::noArray(), kp.kpoints, descriptors);
    kp.image = src;

    return ExecutionStatus(EStatus::Ok,
        string_format("Keypoints detected: %d", (int) kp.kpoints.size()));
}
ExecutionStatus execute(NodeSocketReader& reader, NodeSocketWriter& writer) override
{
    // Read input sockets
    const cv::Mat& src = reader.readSocket(0).getImageMono();
    // Acquire output sockets
    KeyPoints& kp = writer.acquireSocket(0).getKeypoints();
    // Validate inputs
    if(src.empty())
        return ExecutionStatus(EStatus::Ok);

    _brisk->detect(src, kp.kpoints);
    kp.image = src;

    return ExecutionStatus(EStatus::Ok,
        string_format("Keypoints detected: %d", (int) kp.kpoints.size()));
}
ExecutionStatus execute(NodeSocketReader& reader, NodeSocketWriter& writer) override
{
    const clw::Image2D& deviceImage = reader.readSocket(0).getDeviceImageMono();
    KeyPoints& kp = writer.acquireSocket(0).getKeypoints();
    cv::Mat& descriptors = writer.acquireSocket(1).getArray();
    DeviceArray& descriptors_dev = writer.acquireSocket(2).getDeviceArray();

    int imageWidth = deviceImage.width();
    int imageHeight = deviceImage.height();
    if(imageWidth == 0 || imageHeight == 0)
        return ExecutionStatus(EStatus::Ok);

    GpuPerformanceMarker marker(_gpuComputeModule->activityLogger(), "SURF");

    if(!_constantsUploaded)
        uploadSurfConstants();

    ensureKeypointsBufferIsEnough();

    // Zero keypoints counter (use pinned memory)
    // Another way would be to map a pinned buffer (allocated on the host)
    // and read the device buffer through the obtained pointer. If the data transfer is
    // single-direction we don't need to map and unmap every time - only once after
    // creation and once before destruction of the buffer.
    // From the AMD APP OpenCL Programming Guide:
    //   pinnedBuffer = clCreateBuffer(CL_MEM_ALLOC_HOST_PTR or CL_MEM_USE_HOST_PTR)
    //   deviceBuffer = clCreateBuffer()
    //   1)
    //   void* pinnedMemory = clEnqueueMapBuffer(pinnedBuffer) -> can be done only once
    //   clEnqueueRead/WriteBuffer(deviceBuffer, pinnedMemory)
    //   clEnqueueUnmapMemObject(pinnedBuffer, pinnedMemory) -> can be done only once
    //
    //   2)
    //   void* pinnedMemory = clEnqueueMapBuffer(pinnedBuffer)
    //   [Application writes or modifies memory (host memory bandwidth)]
    //   clEnqueueUnmapMemObject(pinnedBuffer, pinnedMemory)
    //   clEnqueueCopyBuffer(pinnedBuffer, deviceBuffer)
    //   - or -
    //   clEnqueueCopyBuffer(deviceBuffer, pinnedBuffer)
    //   void* pinnedMemory = clEnqueueMapBuffer(pinnedBuffer)
    //   [Application reads memory (host memory bandwidth)]
    //   clEnqueueUnmapMemObject(pinnedBuffer, pinnedMemory)
    // On AMD both approaches perform the same.
    int* intPtr = (int*) _gpuComputeModule->queue().mapBuffer(
        _pinnedKeypointsCount_cl, clw::EMapAccess::Write);
    if(!intPtr)
        return ExecutionStatus(EStatus::Error,
            "Couldn't map keypoints counter buffer to host address space");
    intPtr[0] = 0;
    _gpuComputeModule->queue().asyncUnmap(_pinnedKeypointsCount_cl, intPtr);
    _gpuComputeModule->queue().asyncCopyBuffer(_pinnedKeypointsCount_cl, _keypointsCount_cl);

    convertImageToIntegral(deviceImage, imageWidth, imageHeight);
    prepareScaleSpaceLayers(imageWidth, imageHeight);
    buildScaleSpace(imageWidth, imageHeight);
    findScaleSpaceMaxima();

    // Start downloading the input image on the data queue while the compute queue keeps working
    kp.image.create(imageHeight, imageWidth, CV_8UC1);
    _gpuComputeModule->dataQueue().asyncReadImage2D(deviceImage, kp.image.data, (int) kp.image.step);
    _gpuComputeModule->dataQueue().flush();

    _gpuComputeModule->activityLogger().beginPerfMarker("Read number of keypoints", "SURF");

    // Read keypoints counter (use pinned memory)
    _gpuComputeModule->queue().asyncCopyBuffer(_keypointsCount_cl, _pinnedKeypointsCount_cl);
    intPtr = (int*) _gpuComputeModule->queue().mapBuffer(
        _pinnedKeypointsCount_cl, clw::EMapAccess::Read);
    if(!intPtr)
        return ExecutionStatus(EStatus::Error,
            "Couldn't map keypoints counter buffer to host address space");
    int keypointsCount = min(intPtr[0], kKeypointsMax);
    _gpuComputeModule->queue().asyncUnmap(_pinnedKeypointsCount_cl, intPtr);

    _gpuComputeModule->activityLogger().endPerfMarker();

    if(keypointsCount > 0)
    {
        if(!_upright)
            findKeypointOrientation(keypointsCount);
        else
            uprightKeypointOrientation(keypointsCount);
        calculateDescriptors(keypointsCount);

        GpuPerformanceMarker marker(_gpuComputeModule->activityLogger(), "Read results", "SURF");

        // Start copying descriptors to pinned buffer
        if(_downloadDescriptors)
            _gpuComputeModule->queue().asyncCopyBuffer(_descriptors_cl, _pinnedDescriptors_cl);

        vector<KeyPoint> kps = downloadKeypoints(keypointsCount);
        kp.kpoints = transformKeyPoint(kps);

        descriptors_dev = DeviceArray::createFromBuffer(_descriptors_cl,
            64, keypointsCount, EDataType::Float);

        if(_downloadDescriptors)
        {
            descriptors.create(keypointsCount, 64, CV_32F);

            float* floatPtr = (float*) _gpuComputeModule->queue().mapBuffer(
                _pinnedDescriptors_cl, clw::EMapAccess::Read);
            if(!floatPtr)
                return ExecutionStatus(EStatus::Error,
                    "Couldn't map descriptors buffer to host address space");

            if(descriptors.step == 64*sizeof(float))
            {
                //copy(floatPtr, floatPtr + 64*keypointsCount, descriptors.ptr<float>());
                memcpy(descriptors.ptr<float>(), floatPtr, sizeof(float)*64*keypointsCount);
            }
            else
            {
                for(int row = 0; row < keypointsCount; ++row)
                    //copy(floatPtr + 64*row, floatPtr + 64*row + 64, descriptors.ptr<float>(row));
                    memcpy(descriptors.ptr<float>(row), floatPtr + 64*row, sizeof(float)*64);
            }

            _gpuComputeModule->queue().asyncUnmap(_pinnedDescriptors_cl, floatPtr);
        }

        // Finish downloading input image
        _gpuComputeModule->dataQueue().finish();
    }
    else
    {
        // Finish downloading input image
        _gpuComputeModule->dataQueue().finish();

        kp.kpoints = vector<cv::KeyPoint>();
        descriptors = cv::Mat();
        descriptors_dev = DeviceArray();
    }

    return ExecutionStatus(EStatus::Ok,
        string_format("Keypoints detected: %d", (int) kp.kpoints.size()));
}
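The pinned-memory comment in the function above (scheme 2 from the AMD APP guide) boils down to the following standalone round trip in the raw OpenCL C API. This is a minimal sketch, assuming an already-created context, queue, and device buffer; the function name and single-int payload are illustrative and error handling is omitted.

#include <CL/cl.h>

// Minimal sketch of the pinned-buffer transfer pattern (scheme 2 above).
// The caller supplies an existing context, queue and device buffer (illustrative names).
static cl_int roundTripViaPinned(cl_context ctx, cl_command_queue queue, cl_mem deviceBuf)
{
    // Pinned (host-visible) staging buffer
    cl_mem pinned = clCreateBuffer(ctx, CL_MEM_ALLOC_HOST_PTR, sizeof(cl_int), NULL, NULL);

    // Write path: map the pinned buffer, fill it on the host, unmap, copy host->device
    cl_int* p = (cl_int*) clEnqueueMapBuffer(queue, pinned, CL_TRUE, CL_MAP_WRITE,
                                             0, sizeof(cl_int), 0, NULL, NULL, NULL);
    *p = 0; // e.g. zero a keypoint counter
    clEnqueueUnmapMemObject(queue, pinned, p, 0, NULL, NULL);
    clEnqueueCopyBuffer(queue, pinned, deviceBuf, 0, 0, sizeof(cl_int), 0, NULL, NULL);

    // ... kernels would update deviceBuf here ...

    // Read path: copy device->pinned, then map the pinned buffer for reading on the host
    clEnqueueCopyBuffer(queue, deviceBuf, pinned, 0, 0, sizeof(cl_int), 0, NULL, NULL);
    p = (cl_int*) clEnqueueMapBuffer(queue, pinned, CL_TRUE, CL_MAP_READ,
                                     0, sizeof(cl_int), 0, NULL, NULL, NULL);
    cl_int value = *p;
    clEnqueueUnmapMemObject(queue, pinned, p, 0, NULL, NULL);

    clReleaseMemObject(pinned);
    return value;
}

The wrapper calls in the node (mapBuffer/asyncUnmap/asyncCopyBuffer on _pinnedKeypointsCount_cl and _keypointsCount_cl) follow this same pattern, just split across the zeroing and read-back points of the pipeline.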
ExecutionStatus execute(NodeSocketReader& reader, NodeSocketWriter& writer) override
{
    const clw::Image2D& deviceImage = reader.readSocket(0).getDeviceImageMono();
    clw::Image2D& deviceDest = writer.acquireSocket(0).getDeviceImageMono();

    int srcWidth = deviceImage.width();
    int srcHeight = deviceImage.height();
    if(srcWidth == 0 || srcHeight == 0)
        return ExecutionStatus(EStatus::Ok);

    clw::Kernel kernelGaussMix = _gpuComputeModule->acquireKernel(_kidGaussMix);

    /* Create mixture data buffer */
    resetMixturesState(srcWidth * srcHeight);

    if(deviceDest.isNull()
        || deviceDest.width() != srcWidth
        || deviceDest.height() != srcHeight)
    {
        // Foreground image (essentially a mask)
        deviceDest = _gpuComputeModule->context().createImage2D(
            clw::EAccess::ReadWrite, clw::EMemoryLocation::Device,
            clw::ImageFormat(clw::EChannelOrder::R, clw::EChannelType::Normalized_UInt8),
            srcWidth, srcHeight);
    }

    // Calculate dynamic learning rate (if necessary)
    ++_nframe;
    float alpha = _learningRate >= 0 && _nframe > 1
        ? _learningRate
        : 1.0f/std::min(_nframe, _history.cast<int>());

    kernelGaussMix.setLocalWorkSize(16, 16);
    kernelGaussMix.setRoundedGlobalWorkSize(srcWidth, srcHeight);
    kernelGaussMix.setArg(0, deviceImage);
    kernelGaussMix.setArg(1, deviceDest);
    kernelGaussMix.setArg(2, _mixtureDataBuffer);
    kernelGaussMix.setArg(3, _mixtureParamsBuffer);
    kernelGaussMix.setArg(4, alpha);
    _gpuComputeModule->queue().asyncRunKernel(kernelGaussMix);

    if(_showBackground)
    {
        clw::Image2D& deviceDestBackground = writer.acquireSocket(1).getDeviceImageMono();

        if(deviceDestBackground.isNull()
            || deviceDestBackground.width() != srcWidth
            || deviceDestBackground.height() != srcHeight)
        {
            // Estimated background image
            deviceDestBackground = _gpuComputeModule->context().createImage2D(
                clw::EAccess::ReadWrite, clw::EMemoryLocation::Device,
                clw::ImageFormat(clw::EChannelOrder::R, clw::EChannelType::Normalized_UInt8),
                srcWidth, srcHeight);
        }

        clw::Kernel kernelBackground = _gpuComputeModule->acquireKernel(_kidGaussBackground);
        kernelBackground.setLocalWorkSize(16, 16);
        kernelBackground.setRoundedGlobalWorkSize(srcWidth, srcHeight);
        kernelBackground.setArg(0, deviceDestBackground);
        kernelBackground.setArg(1, _mixtureDataBuffer);
        kernelBackground.setArg(2, _mixtureParamsBuffer);
        _gpuComputeModule->queue().asyncRunKernel(kernelBackground);
    }

    _gpuComputeModule->queue().finish();

    return ExecutionStatus(EStatus::Ok);
}