void NetCaffe::initializationOnThread() { try { #ifdef USE_CAFFE // Initialize net #ifdef USE_CUDA caffe::Caffe::set_mode(caffe::Caffe::GPU); caffe::Caffe::SetDevice(upImpl->mGpuId); #else caffe::Caffe::set_mode(caffe::Caffe::CPU); #endif upImpl->upCaffeNet.reset(new caffe::Net<float>{upImpl->mCaffeProto, caffe::TEST}); upImpl->upCaffeNet->CopyTrainedLayersFrom(upImpl->mCaffeTrainedModel); #ifdef USE_CUDA cudaCheck(__LINE__, __FUNCTION__, __FILE__); #endif // Set spOutputBlob upImpl->spOutputBlob = upImpl->upCaffeNet->blob_by_name(upImpl->mLastBlobName); if (upImpl->spOutputBlob == nullptr) error("The output blob is a nullptr. Did you use the same name than the prototxt? (Used: " + upImpl->mLastBlobName + ").", __LINE__, __FUNCTION__, __FILE__); #ifdef USE_CUDA cudaCheck(__LINE__, __FUNCTION__, __FILE__); #endif #endif } catch (const std::exception& e) { error(e.what(), __LINE__, __FUNCTION__, __FILE__); } }
void FaceExtractorCaffe::netInitializationOnThread() { try { #if defined USE_CAFFE // Logging log("Starting initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); // Initialize Caffe net upImpl->spNetCaffe->initializationOnThread(); #ifdef USE_CUDA cudaCheck(__LINE__, __FUNCTION__, __FILE__); #endif // Initialize blobs upImpl->spCaffeNetOutputBlob = upImpl->spNetCaffe->getOutputBlob(); upImpl->spHeatMapsBlob = {std::make_shared<caffe::Blob<float>>(1,1,1,1)}; upImpl->spPeaksBlob = {std::make_shared<caffe::Blob<float>>(1,1,1,1)}; #ifdef USE_CUDA cudaCheck(__LINE__, __FUNCTION__, __FILE__); #endif // Logging log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); #endif } catch (const std::exception& e) { error(e.what(), __LINE__, __FUNCTION__, __FILE__); } }
/// Maps the registered graphics resource and queries its mapped pointer.
///
/// @return The mapped pointer (as float*) paired with the mapped size in bytes. The size
///         reported by CUDA is a size_t and is converted to unsigned int for the return value.
std::pair<float *, unsigned int> VBOCudaMapper::mapVBOBuffer()
{
    cudaCheck(cudaGraphicsMapResources(1, &cuda_graphics_resource_, nullptr));

    float* mapped_ptr = nullptr;
    size_t mapped_bytes;
    cudaCheck(cudaGraphicsResourceGetMappedPointer(reinterpret_cast<void**>(&mapped_ptr),
                                                   &mapped_bytes,
                                                   cuda_graphics_resource_));

    return { mapped_ptr, static_cast<unsigned int>(mapped_bytes) };
}
inline void reshapeFaceExtractorCaffe(std::shared_ptr<ResizeAndMergeCaffe<float>>& resizeAndMergeCaffe, std::shared_ptr<MaximumCaffe<float>>& maximumCaffe, boost::shared_ptr<caffe::Blob<float>>& caffeNetOutputBlob, std::shared_ptr<caffe::Blob<float>>& heatMapsBlob, std::shared_ptr<caffe::Blob<float>>& peaksBlob, const int gpuID) { try { // HeatMaps extractor blob and layer const bool mergeFirstDimension = true; resizeAndMergeCaffe->Reshape({caffeNetOutputBlob.get()}, {heatMapsBlob.get()}, FACE_CCN_DECREASE_FACTOR, 1.f, mergeFirstDimension, gpuID); // Pose extractor blob and layer maximumCaffe->Reshape({heatMapsBlob.get()}, {peaksBlob.get()}); // Cuda check #ifdef USE_CUDA cudaCheck(__LINE__, __FUNCTION__, __FILE__); #endif } catch (const std::exception& e) { error(e.what(), __LINE__, __FUNCTION__, __FILE__); } }
/// Allocates storage for an OpenGL buffer and registers it with CUDA.
///
/// @param VBO        OpenGL buffer object name.
/// @param byte_size  Number of bytes to allocate for the buffer (no initial data is uploaded).
VBOCudaMapper::VBOCudaMapper(unsigned int VBO, unsigned int byte_size)
{
    // Allocate the buffer store with GL_DYNAMIC_DRAW usage, then restore the default binding.
    glBindBuffer(GL_ARRAY_BUFFER, VBO);
    glBufferData(GL_ARRAY_BUFFER, byte_size, nullptr, GL_DYNAMIC_DRAW);
    glBindBuffer(GL_ARRAY_BUFFER, 0);

    // Register with the write-discard map flag.
    cudaCheck(cudaGraphicsGLRegisterBuffer(&cuda_graphics_resource_,
                                           VBO,
                                           cudaGraphicsMapFlagsWriteDiscard));
}
/* Prints the RANK value and the total / free GPU memory (in units of 1e6 bytes) to stdout. */
void memoryInfo(void)
{
    size_t freeBytes;
    size_t totalBytes;
    cudaCheck(cudaMemGetInfo(&freeBytes, &totalBytes), "MemInfo11");

    /* Byte counts go through float before the division, as in the printed values so far. */
    const double totalMB = static_cast<float>(totalBytes) / 1e6;
    const double freeMB = static_cast<float>(freeBytes) / 1e6;

    printf("\n");
    printf("\nRANK=%d\n", RANK);
    printf("\nGPU total memory = % .2f MB\n", totalMB);
    printf("\nGPU free memory = % .2f MB\n", freeMB);
}
// Renders hand keypoints on the GPU.
// outputData: frame buffer; uploaded to the GPU if needed and downloaded again if this is the
//             last renderer.
// handKeypoints: two keypoint arrays (index 0 and 1); both are copied back-to-back into pGpuHand.
//             NOTE(review): the copy size of both arrays is derived from handKeypoints[0] only,
//             so both arrays are presumably expected to have the same dimensions - confirm.
// With CPU_ONLY defined this function only reports an error.
void HandRenderer::renderHandGpu(Array<float>& outputData,
                                 const std::array<Array<float>, 2>& handKeypoints)
{
    try
    {
        #ifndef CPU_ONLY
            const auto elementRendered = spElementToRender->load();
            const auto& firstHands = handKeypoints[0];
            const auto numberPeople = firstHands.getSize(0);
            // Keypoints are only drawn for element 0 and when somebody was detected
            if (numberPeople > 0 && elementRendered == 0)
            {
                cpuToGpuMemoryIfNotCopiedYet(outputData.getPtr());
                // Upload both keypoint sets contiguously
                const auto handArea = firstHands.getSize(1)*firstHands.getSize(2);
                const auto handVolume = numberPeople * handArea;
                const auto handBytes = handVolume * sizeof(float);
                cudaMemcpy(pGpuHand, firstHands.getConstPtr(), handBytes, cudaMemcpyHostToDevice);
                cudaMemcpy(pGpuHand + handVolume, handKeypoints[1].getConstPtr(), handBytes,
                           cudaMemcpyHostToDevice);
                renderHandKeypointsGpu(*spGpuMemoryPtr, mFrameSize, pGpuHand, 2 * numberPeople,
                                       mRenderThreshold);
                cudaCheck(__LINE__, __FUNCTION__, __FILE__);
            }
            // Download the frame if this is the last renderer
            gpuToCpuMemoryIfLastRenderer(outputData.getPtr());
            cudaCheck(__LINE__, __FUNCTION__, __FILE__);
        #else
            error("GPU rendering not available if `CPU_ONLY` is set.", __LINE__, __FUNCTION__, __FILE__);
            UNUSED(outputData);
            UNUSED(handKeypoints);
        #endif
    }
    catch (const std::exception& e)
    {
        error(e.what(), __LINE__, __FUNCTION__, __FILE__);
    }
}
// Reshapes the first blob of caffeNet to `dimensions` and then reshapes the whole net.
// Any std::exception is forwarded to error().
inline void reshapeNetCaffe(caffe::Net<float>* caffeNet, const std::vector<int>& dimensions)
{
    try
    {
        const auto& firstBlob = caffeNet->blobs()[0];
        firstBlob->Reshape(dimensions);
        caffeNet->Reshape();
        #ifdef USE_CUDA
            cudaCheck(__LINE__, __FUNCTION__, __FILE__);
        #endif
    }
    catch (const std::exception& e)
    {
        error(e.what(), __LINE__, __FUNCTION__, __FILE__);
    }
}
void NetCaffe::forwardPass(const Array<float>& inputData) const { try { #ifdef USE_CAFFE // Security checks if (inputData.empty()) error("The Array inputData cannot be empty.", __LINE__, __FUNCTION__, __FILE__); if (inputData.getNumberDimensions() != 4 || inputData.getSize(1) != 3) error("The Array inputData must have 4 dimensions: [batch size, 3 (RGB), height, width].", __LINE__, __FUNCTION__, __FILE__); // Reshape Caffe net if required if (!vectorsAreEqual(upImpl->mNetInputSize4D, inputData.getSize())) { upImpl->mNetInputSize4D = inputData.getSize(); reshapeNetCaffe(upImpl->upCaffeNet.get(), inputData.getSize()); } // Copy frame data to GPU memory #ifdef USE_CUDA auto* gpuImagePtr = upImpl->upCaffeNet->blobs().at(0)->mutable_gpu_data(); cudaMemcpy(gpuImagePtr, inputData.getConstPtr(), inputData.getVolume() * sizeof(float), cudaMemcpyHostToDevice); #elif defined USE_OPENCL auto* gpuImagePtr = upImpl->upCaffeNet->blobs().at(0)->mutable_gpu_data(); cl::Buffer imageBuffer = cl::Buffer((cl_mem)gpuImagePtr, true); op::OpenCL::getInstance(upImpl->mGpuId)->getQueue().enqueueWriteBuffer(imageBuffer, true, 0, inputData.getVolume() * sizeof(float), inputData.getConstPtr()); #else auto* cpuImagePtr = upImpl->upCaffeNet->blobs().at(0)->mutable_cpu_data(); std::copy(inputData.getConstPtr(), inputData.getConstPtr() + inputData.getVolume(), cpuImagePtr); #endif // Perform deep network forward pass upImpl->upCaffeNet->ForwardFrom(0); // Cuda checks #ifdef USE_CUDA cudaCheck(__LINE__, __FUNCTION__, __FILE__); #endif #else UNUSED(inputData); #endif } catch (const std::exception& e) { error(e.what(), __LINE__, __FUNCTION__, __FILE__); } }
void PoseGpuRenderer::initializationOnThread() { try { log("Starting initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); // GPU memory allocation for rendering #ifdef USE_CUDA cudaMalloc((void**)(&pGpuPose), POSE_MAX_PEOPLE * getPoseNumberBodyParts(mPoseModel) * 3 * sizeof(float)); cudaCheck(__LINE__, __FUNCTION__, __FILE__); #endif log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); } catch (const std::exception& e) { error(e.what(), __LINE__, __FUNCTION__, __FILE__); } }
int getCudaGpuNumber() { try { #ifdef USE_CUDA int gpuNumber; cudaGetDeviceCount(&gpuNumber); cudaCheck(__LINE__, __FUNCTION__, __FILE__); return gpuNumber; #else error("OpenPose must be compiled with the `USE_CUDA` macro definition in order to use this" " functionality.", __LINE__, __FUNCTION__, __FILE__); return -1; #endif } catch (const std::exception& e) { error(e.what(), __LINE__, __FUNCTION__, __FILE__); return -1; } }
// Round-trips a handful of floats through the device float->half->float conversions.
// Stage 1 compares the device half encoding with the CPU Float16 encoder (tolerance: 1 unit).
// Stage 2 converts back to float on the device and expects each value within 0.1% of the input.
TEST(HalfPrec, cuda) {
  float hostFloats[] = {
    -1,
    -100,
    2.3,
    0.0,
    1.0,
    3867.2,
  };
  // Compile-time element count: it is used as an array bound below, and the loop indices are
  // size_t as well so index and bound have the same signedness.
  constexpr size_t N = sizeof(hostFloats) / sizeof(hostFloats[0]);

  CUDA<float> devFloats(hostFloats, N);
  CUDA<half_t> devHalfs(N);

  halfprec_ToHalf(nullptr, devFloats.data(), devHalfs.data(), devFloats.size());
  cudaCheck(cudaDeviceSynchronize());

  {
    uint16_t cpuHalfs[N] = { 666 };
    facebook::math::Float16::encode(cpuHalfs, hostFloats, N);
    half_t convertedHalfs[N];
    devHalfs.toHost(convertedHalfs);
    for (size_t i = 0; i < N; i++) {
      // The CPU and GPU disagree by a digit sometimes because the GPU
      // is using a different rounding mode.
      EXPECT_NEAR(cpuHalfs[i], convertedHalfs[i], 1);
    }
  }

  CUDA<float> exploded(N);
  halfprec_ToFloat(nullptr, devHalfs.data(), exploded.data(), N);
  float postExpl[N];
  exploded.toHost(postExpl);
  for (size_t i = 0; i < N; i++) {
    auto thousandth = fabs(hostFloats[i] / 1000.0);
    EXPECT_NEAR(postExpl[i], hostFloats[i], thousandth);
  }
}
/// Unmaps the registered graphics resource (counterpart of the map call, default stream).
void VBOCudaMapper::unmapVBOBuffer()
{
    cudaCheck(cudaGraphicsUnmapResources(1,
                                         &cuda_graphics_resource_,
                                         nullptr));
}
int main() { //Checks for memory leaks in debug mode _CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF); glfwInit(); glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4); glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 4); glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); glfwWindowHint(GLFW_RESIZABLE, GL_FALSE); GLFWwindow* window = glfwCreateWindow(width, height, "Hikari", nullptr, nullptr); glfwMakeContextCurrent(window); //Set callbacks for keyboard and mouse glfwSetInputMode(window, GLFW_CURSOR, GLFW_CURSOR_DISABLED); glewExperimental = GL_TRUE; glewInit(); glGetError(); //Define the viewport dimensions glViewport(0, 0, width, height); //Initialize cuda->opengl context cudaCheck(cudaGLSetGLDevice(0)); cudaGraphicsResource *resource; //Create a texture to store ray tracing result GLuint tex; glActiveTexture(GL_TEXTURE0); glGenTextures(1, &tex); glBindTexture(GL_TEXTURE_2D, tex); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA32F, width, height, 0, GL_RGBA, GL_FLOAT, NULL); cudaCheck(cudaGraphicsGLRegisterImage(&resource, tex, GL_TEXTURE_2D, cudaGraphicsMapFlagsWriteDiscard)); glBindTexture(GL_TEXTURE_2D, 0); Shader final = Shader("fsQuad.vert", "fsQuad.frag"); FullscreenQuad fsQuad = FullscreenQuad(); float4* buffer; cudaCheck(cudaMalloc((void**)&buffer, width * height * sizeof(float4))); cudaCheck(cudaMemset(buffer, 0, width * height * sizeof(float4))); //Mesh float3 offset = make_float3(0); float3 scale = make_float3(15); Mesh cBox("objs/Avent", 0, scale, offset); offset = make_float3(0, 55, 0); scale = make_float3(100); Mesh light("objs/plane", (int)cBox.triangles.size(), scale, offset); cBox.triangles.insert(cBox.triangles.end(), light.triangles.begin(), light.triangles.end()); 
cBox.aabbs.insert(cBox.aabbs.end(), light.aabbs.begin(), light.aabbs.end()); std::cout << "Num triangles: " << cBox.triangles.size() << std::endl; cBox.root = AABB(fminf(cBox.root.minBounds, light.root.minBounds), fmaxf(cBox.root.maxBounds, light.root.maxBounds)); BVH bvh(cBox.aabbs, cBox.triangles, cBox.root); Camera cam(make_float3(14, 15, 80), make_int2(width, height), 45.0f, 0.04f, 80.0f); Camera* dCam; cudaCheck(cudaMalloc((void**)&dCam, sizeof(Camera))); cudaCheck(cudaMemcpy(dCam, &cam, sizeof(Camera), cudaMemcpyHostToDevice)); cudaCheck(cudaGraphicsMapResources(1, &resource, 0)); cudaArray* pixels; cudaCheck(cudaGraphicsSubResourceGetMappedArray(&pixels, resource, 0, 0)); cudaResourceDesc viewCudaArrayResourceDesc; viewCudaArrayResourceDesc.resType = cudaResourceTypeArray; viewCudaArrayResourceDesc.res.array.array = pixels; cudaSurfaceObject_t viewCudaSurfaceObject; cudaCheck(cudaCreateSurfaceObject(&viewCudaSurfaceObject, &viewCudaArrayResourceDesc)); cudaCheck(cudaGraphicsUnmapResources(1, &resource, 0)); while (!glfwWindowShouldClose(window)) { float currentFrame = float(glfwGetTime()); deltaTime = currentFrame - lastFrame; lastFrame = currentFrame; //Check and call events glfwPollEvents(); handleInput(window, cam); if (cam.moved) { frameNumber = 0; cudaCheck(cudaMemset(buffer, 0, width * height * sizeof(float4))); } cam.rebuildCamera(); cudaCheck(cudaMemcpy(dCam, &cam, sizeof(Camera), cudaMemcpyHostToDevice)); frameNumber++; if (frameNumber < 20000) { cudaCheck(cudaGraphicsMapResources(1, &resource, 0)); std::chrono::time_point<std::chrono::system_clock> start, end; start = std::chrono::system_clock::now(); render(cam, dCam, viewCudaSurfaceObject, buffer, bvh.dTriangles, bvh.dNodes, frameNumber, cam.moved); end = std::chrono::system_clock::now(); std::chrono::duration<double> elapsed = end - start; std::cout << "Frame: " << frameNumber << " --- Elapsed time: " << elapsed.count() << "s\n"; cudaCheck(cudaGraphicsUnmapResources(1, &resource, 0)); } 
cam.moved = false; glUseProgram(final.program); glActiveTexture(GL_TEXTURE0); glBindTexture(GL_TEXTURE_2D, tex); glClear(GL_COLOR_BUFFER_BIT); final.setUniformi("tRender", 0); fsQuad.render(); //std::cout << glGetError() << std::endl; //Swap the buffers glfwSwapBuffers(window); glfwSetCursorPos(window, lastX, lastY); }
/// Unregisters the graphics resource held by this mapper.
VBOCudaMapper::~VBOCudaMapper()
{
    cudaCheck(
        cudaGraphicsUnregisterResource(cuda_graphics_resource_));
}
/// Allocates device storage for `n` elements of T and zero-fills it.
explicit CUDA(size_t n)
    : n_(n)
{
    const size_t byteCount = n_ * sizeof(T);
    cudaCheck(cudaMalloc(&vals_, byteCount));
    cudaCheck(cudaMemset(vals_, 0, byteCount));
}
/// Registers an existing OpenGL buffer with CUDA using the write-discard map flag.
///
/// @param VBO  OpenGL buffer object name; its storage is not (re)allocated here.
VBOCudaMapper::VBOCudaMapper(unsigned int VBO)
{
    cudaCheck(cudaGraphicsGLRegisterBuffer(&cuda_graphics_resource_,
                                           VBO,
                                           cudaGraphicsMapFlagsWriteDiscard));
}
std::pair<int, std::string> PoseGpuRenderer::renderPose(Array<float>& outputData, const Array<float>& poseKeypoints, const float scaleInputToOutput, const float scaleNetToOutput) { try { // Security checks if (outputData.empty()) error("Empty Array<float> outputData.", __LINE__, __FUNCTION__, __FILE__); // GPU rendering const auto elementRendered = spElementToRender->load(); std::string elementRenderedName; #ifdef USE_CUDA const auto numberPeople = poseKeypoints.getSize(0); if (numberPeople > 0 || elementRendered != 0 || !mBlendOriginalFrame) { cpuToGpuMemoryIfNotCopiedYet(outputData.getPtr(), outputData.getVolume()); cudaCheck(__LINE__, __FUNCTION__, __FILE__); const auto numberBodyParts = getPoseNumberBodyParts(mPoseModel); const auto numberBodyPartsPlusBkg = numberBodyParts+1; const auto numberBodyPAFChannels = getPosePartPairs(mPoseModel).size(); const Point<int> frameSize{outputData.getSize(1), outputData.getSize(0)}; // Draw poseKeypoints if (elementRendered == 0) { // Rescale keypoints to output size auto poseKeypointsRescaled = poseKeypoints.clone(); scaleKeypoints(poseKeypointsRescaled, scaleInputToOutput); // Render keypoints if (!poseKeypoints.empty()) cudaMemcpy(pGpuPose, poseKeypointsRescaled.getConstPtr(), numberPeople * numberBodyParts * 3 * sizeof(float), cudaMemcpyHostToDevice); renderPoseKeypointsGpu(*spGpuMemory, mPoseModel, numberPeople, frameSize, pGpuPose, mRenderThreshold, mShowGooglyEyes, mBlendOriginalFrame, getAlphaKeypoint()); } else { // If resized to input resolution: Replace scaleNetToOutput * scaleInputToOutput by // scaleInputToOutput, and comment the security checks. 
// Security checks if (scaleNetToOutput == -1.f) error("Non valid scaleNetToOutput.", __LINE__, __FUNCTION__, __FILE__); // Parameters const auto& heatMapSizes = spPoseExtractorNet->getHeatMapSize(); const Point<int> heatMapSize{heatMapSizes[3], heatMapSizes[2]}; const auto lastPAFChannel = numberBodyPartsPlusBkg+2+numberBodyPAFChannels/2; // Add all heatmaps if (elementRendered == 2) // if (elementRendered == numberBodyPartsPlusBkg+1) { elementRenderedName = "Heatmaps"; renderPoseHeatMapsGpu(*spGpuMemory, mPoseModel, frameSize, spPoseExtractorNet->getHeatMapGpuConstPtr(), heatMapSize, scaleNetToOutput * scaleInputToOutput, (mBlendOriginalFrame ? getAlphaHeatMap() : 1.f)); } // Draw PAFs (Part Affinity Fields) else if (elementRendered == 3) // else if (elementRendered == numberBodyPartsPlusBkg+2) { elementRenderedName = "PAFs (Part Affinity Fields)"; renderPosePAFsGpu(*spGpuMemory, mPoseModel, frameSize, spPoseExtractorNet->getHeatMapGpuConstPtr(), heatMapSize, scaleNetToOutput * scaleInputToOutput, (mBlendOriginalFrame ? getAlphaHeatMap() : 1.f)); } // Draw specific body part or background else if (elementRendered <= numberBodyPartsPlusBkg+2) { const auto realElementRendered = (elementRendered == 1 ? numberBodyParts : elementRendered - 4); elementRenderedName = mPartIndexToName.at(realElementRendered); renderPoseHeatMapGpu(*spGpuMemory, mPoseModel, frameSize, spPoseExtractorNet->getHeatMapGpuConstPtr(), heatMapSize, scaleNetToOutput * scaleInputToOutput, realElementRendered, (mBlendOriginalFrame ? 
getAlphaHeatMap() : 1.f)); } // Draw affinity between 2 body parts else if (elementRendered <= lastPAFChannel) { const auto affinityPart = (elementRendered-numberBodyPartsPlusBkg-3)*2; const auto affinityPartMapped = numberBodyPartsPlusBkg + getPoseMapIndex(mPoseModel).at(affinityPart); elementRenderedName = mPartIndexToName.at(affinityPartMapped); elementRenderedName = elementRenderedName.substr(0, elementRenderedName.find("(")); renderPosePAFGpu(*spGpuMemory, mPoseModel, frameSize, spPoseExtractorNet->getHeatMapGpuConstPtr(), heatMapSize, scaleNetToOutput * scaleInputToOutput, affinityPartMapped, (mBlendOriginalFrame ? getAlphaHeatMap() : 1.f)); } // Draw neck-part distance channel else { if (mPoseModel != PoseModel::BODY_25D) error("Neck-part distance channel only for BODY_25D.", __LINE__, __FUNCTION__, __FILE__); const auto distancePart = (elementRendered - lastPAFChannel - 1); const auto distancePartMapped = numberBodyPartsPlusBkg + numberBodyPAFChannels + distancePart; elementRenderedName = mPartIndexToName.at(distancePartMapped); renderPoseDistance(*spGpuMemory, mPoseModel, frameSize, spPoseExtractorNet->getHeatMapGpuConstPtr(), heatMapSize, scaleNetToOutput * scaleInputToOutput, distancePartMapped, (mBlendOriginalFrame ? getAlphaHeatMap() : 1.f)); } } } // GPU memory to CPU if last renderer gpuToCpuMemoryIfLastRenderer(outputData.getPtr(), outputData.getVolume()); cudaCheck(__LINE__, __FUNCTION__, __FILE__); #else UNUSED(outputData); UNUSED(poseKeypoints); UNUSED(scaleInputToOutput); UNUSED(scaleNetToOutput); error("OpenPose must be compiled with the `USE_CUDA` macro definitions in order to run this" " functionality. You can alternatively use CPU rendering (flag `--render_pose 1`).", __LINE__, __FUNCTION__, __FILE__); #endif // Return result return std::make_pair(elementRendered, elementRenderedName); } catch (const std::exception& e) { error(e.what(), __LINE__, __FUNCTION__, __FILE__); return std::make_pair(-1, ""); } }
/// Allocates device storage for `n` elements of T and fills it from the host array `base`.
CUDA(const T* base, size_t n)
    : n_(n)
{
    const size_t byteCount = n_ * sizeof(T);
    cudaCheck(cudaMalloc(&vals_, byteCount));
    cudaCheck(cudaMemcpy(vals_, base, byteCount, cudaMemcpyHostToDevice));
}
void FaceExtractorCaffe::forwardPass(const std::vector<Rectangle<float>>& faceRectangles, const cv::Mat& cvInputData, const double scaleInputToOutput) { try { #if defined USE_CAFFE if (!faceRectangles.empty()) { // Security checks if (cvInputData.empty()) error("Empty cvInputData.", __LINE__, __FUNCTION__, __FILE__); // Fix parameters const auto netInputSide = fastMin(mNetOutputSize.x, mNetOutputSize.y); // Set face size const auto numberPeople = (int)faceRectangles.size(); mFaceKeypoints.reset({numberPeople, (int)FACE_NUMBER_PARTS, 3}, 0); // HeatMaps: define size if (!mHeatMapTypes.empty()) mHeatMaps.reset({numberPeople, (int)FACE_NUMBER_PARTS, mNetOutputSize.y, mNetOutputSize.x}); // // Debugging // cv::Mat cvInputDataCopy = cvInputData.clone(); // Extract face keypoints for each person for (auto person = 0 ; person < numberPeople ; person++) { const auto& faceRectangle = faceRectangles.at(person); // Only consider faces with a minimum pixel area const auto minFaceSize = fastMin(faceRectangle.width, faceRectangle.height); // // Debugging -> red rectangle // log(std::to_string(cvInputData.cols) + " " + std::to_string(cvInputData.rows)); // cv::rectangle(cvInputDataCopy, // cv::Point{(int)faceRectangle.x, (int)faceRectangle.y}, // cv::Point{(int)faceRectangle.bottomRight().x, // (int)faceRectangle.bottomRight().y}, // cv::Scalar{0,0,255}, 2); // Get parts if (minFaceSize > 40) { // // Debugging -> green rectangle overwriting red one // log(std::to_string(cvInputData.cols) + " " + std::to_string(cvInputData.rows)); // cv::rectangle(cvInputDataCopy, // cv::Point{(int)faceRectangle.x, (int)faceRectangle.y}, // cv::Point{(int)faceRectangle.bottomRight().x, // (int)faceRectangle.bottomRight().y}, // cv::Scalar{0,255,0}, 2); // Resize and shift image to face rectangle positions const auto faceSize = fastMax(faceRectangle.width, faceRectangle.height); const double scaleFace = faceSize / (double)netInputSide; cv::Mat Mscaling = cv::Mat::eye(2, 3, CV_64F); 
Mscaling.at<double>(0,0) = scaleFace; Mscaling.at<double>(1,1) = scaleFace; Mscaling.at<double>(0,2) = faceRectangle.x; Mscaling.at<double>(1,2) = faceRectangle.y; cv::Mat faceImage; cv::warpAffine(cvInputData, faceImage, Mscaling, cv::Size{mNetOutputSize.x, mNetOutputSize.y}, CV_INTER_LINEAR | CV_WARP_INVERSE_MAP, cv::BORDER_CONSTANT, cv::Scalar(0,0,0)); // cv::Mat -> float* uCharCvMatToFloatPtr(mFaceImageCrop.getPtr(), faceImage, true); // // Debugging // if (person < 5) // cv::imshow("faceImage" + std::to_string(person), faceImage); // 1. Caffe deep network upImpl->spNetCaffe->forwardPass(mFaceImageCrop); // Reshape blobs if (!upImpl->netInitialized) { upImpl->netInitialized = true; reshapeFaceExtractorCaffe(upImpl->spResizeAndMergeCaffe, upImpl->spMaximumCaffe, upImpl->spCaffeNetOutputBlob, upImpl->spHeatMapsBlob, upImpl->spPeaksBlob, upImpl->mGpuId); } // 2. Resize heat maps + merge different scales #ifdef USE_CUDA upImpl->spResizeAndMergeCaffe->Forward_gpu({upImpl->spCaffeNetOutputBlob.get()}, {upImpl->spHeatMapsBlob.get()}); cudaCheck(__LINE__, __FUNCTION__, __FILE__); #elif USE_OPENCL upImpl->spResizeAndMergeCaffe->Forward_ocl({upImpl->spCaffeNetOutputBlob.get()}, {upImpl->spHeatMapsBlob.get()}); #else upImpl->spResizeAndMergeCaffe->Forward_cpu({upImpl->spCaffeNetOutputBlob.get()}, {upImpl->spHeatMapsBlob.get()}); #endif // 3. 
Get peaks by Non-Maximum Suppression #ifdef USE_CUDA upImpl->spMaximumCaffe->Forward_gpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}); cudaCheck(__LINE__, __FUNCTION__, __FILE__); #elif USE_OPENCL // CPU Version is already very fast (4ms) and data is sent to connectKeypoints as CPU for now anyway upImpl->spMaximumCaffe->Forward_cpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}); #else upImpl->spMaximumCaffe->Forward_cpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}); #endif const auto* facePeaksPtr = upImpl->spPeaksBlob->mutable_cpu_data(); for (auto part = 0 ; part < mFaceKeypoints.getSize(1) ; part++) { const auto xyIndex = part * mFaceKeypoints.getSize(2); const auto x = facePeaksPtr[xyIndex]; const auto y = facePeaksPtr[xyIndex + 1]; const auto score = facePeaksPtr[xyIndex + 2]; const auto baseIndex = mFaceKeypoints.getSize(2) * (part + person * mFaceKeypoints.getSize(1)); mFaceKeypoints[baseIndex] = (float)(scaleInputToOutput * (Mscaling.at<double>(0,0) * x + Mscaling.at<double>(0,1) * y + Mscaling.at<double>(0,2))); mFaceKeypoints[baseIndex+1] = (float)(scaleInputToOutput * (Mscaling.at<double>(1,0) * x + Mscaling.at<double>(1,1) * y + Mscaling.at<double>(1,2))); mFaceKeypoints[baseIndex+2] = score; } // HeatMaps: storing if (!mHeatMapTypes.empty()){ #ifdef USE_CUDA updateFaceHeatMapsForPerson(mHeatMaps, person, mHeatMapScaleMode, upImpl->spHeatMapsBlob->gpu_data()); #else updateFaceHeatMapsForPerson(mHeatMaps, person, mHeatMapScaleMode, upImpl->spHeatMapsBlob->cpu_data()); #endif } } } // // Debugging // cv::imshow("AcvInputDataCopy", cvInputDataCopy); } else mFaceKeypoints.reset(); #else UNUSED(faceRectangles); UNUSED(cvInputData); UNUSED(scaleInputToOutput); #endif } catch (const std::exception& e) { error(e.what(), __LINE__, __FUNCTION__, __FILE__); } }
/// Copies all n_ elements from device storage into the host array `base`.
void toHost(T* base) const
{
    const size_t byteCount = n_ * sizeof(T);
    cudaCheck(cudaMemcpy(base, vals_, byteCount, cudaMemcpyDeviceToHost));
}
/// Releases the device storage held in vals_.
~CUDA()
{
    cudaCheck(
        cudaFree(vals_));
}