Пример #1
0
 // Per-thread set-up of the Caffe network (only when built with USE_CAFFE):
 //   1. selects the Caffe compute mode (GPU + device id with USE_CUDA, CPU otherwise),
 //   2. builds the net from mCaffeProto in TEST phase and loads the weights from mCaffeTrainedModel,
 //   3. caches the blob named mLastBlobName as the output blob, reporting an error if it is missing.
 // Any std::exception is forwarded to error().
 void NetCaffe::initializationOnThread()
 {
     try
     {
         #ifdef USE_CAFFE
             // Compute mode selection
             #ifndef USE_CUDA
                 caffe::Caffe::set_mode(caffe::Caffe::CPU);
             #else
                 caffe::Caffe::set_mode(caffe::Caffe::GPU);
                 caffe::Caffe::SetDevice(upImpl->mGpuId);
             #endif
             // Build the network and load its trained weights
             auto& caffeNet = upImpl->upCaffeNet;
             caffeNet.reset(new caffe::Net<float>{upImpl->mCaffeProto, caffe::TEST});
             caffeNet->CopyTrainedLayersFrom(upImpl->mCaffeTrainedModel);
             #ifdef USE_CUDA
                 cudaCheck(__LINE__, __FUNCTION__, __FILE__);
             #endif
             // Cache the output blob, looked up by name
             auto& outputBlob = upImpl->spOutputBlob;
             outputBlob = caffeNet->blob_by_name(upImpl->mLastBlobName);
             if (!outputBlob)
                 error("The output blob is a nullptr. Did you use the same name than the prototxt? (Used: "
                       + upImpl->mLastBlobName + ").", __LINE__, __FUNCTION__, __FILE__);
             #ifdef USE_CUDA
                 cudaCheck(__LINE__, __FUNCTION__, __FILE__);
             #endif
         #endif
     }
     catch (const std::exception& e)
     {
         error(e.what(), __LINE__, __FUNCTION__, __FILE__);
     }
 }
Пример #2
0
 // Per-thread initialization of the face extractor (only when built with USE_CAFFE):
 // initializes the wrapped Caffe net, stores its output blob and creates 1x1x1x1 placeholder
 // blobs for the heat maps and the peaks. Any std::exception is forwarded to error().
 void FaceExtractorCaffe::netInitializationOnThread()
 {
     try
     {
         #ifdef USE_CAFFE
             log("Starting initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
             auto& impl = *upImpl;
             // Caffe net
             impl.spNetCaffe->initializationOnThread();
             #ifdef USE_CUDA
                 cudaCheck(__LINE__, __FUNCTION__, __FILE__);
             #endif
             // Blobs: net output + placeholders for heat maps and peaks
             impl.spCaffeNetOutputBlob = impl.spNetCaffe->getOutputBlob();
             impl.spHeatMapsBlob = std::make_shared<caffe::Blob<float>>(1, 1, 1, 1);
             impl.spPeaksBlob = std::make_shared<caffe::Blob<float>>(1, 1, 1, 1);
             #ifdef USE_CUDA
                 cudaCheck(__LINE__, __FUNCTION__, __FILE__);
             #endif
             log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
         #endif
     }
     catch (const std::exception& e)
     {
         error(e.what(), __LINE__, __FUNCTION__, __FILE__);
     }
 }
Пример #3
0
// Maps the registered graphics resource for CUDA access and returns the mapped pointer
// (viewed as float *) together with the mapped size in bytes. The size reported by CUDA is a
// size_t and is narrowed to unsigned int to fit the return type.
std::pair<float *, unsigned int> VBOCudaMapper::mapVBOBuffer()
{
    cudaCheck(cudaGraphicsMapResources(1, &cuda_graphics_resource_, nullptr));

    void * mapped_ptr = nullptr;
    size_t mapped_bytes;
    cudaCheck(cudaGraphicsResourceGetMappedPointer(&mapped_ptr, &mapped_bytes, cuda_graphics_resource_));

    return std::pair<float *, unsigned int>(static_cast<float *>(mapped_ptr),
                                            static_cast<unsigned int>(mapped_bytes));
}
Пример #4
0
 // Reshapes the two post-processing layers of the face extractor:
 //   - resizeAndMergeCaffe: net output blob -> heat maps blob,
 //   - maximumCaffe: heat maps blob -> peaks blob.
 // With USE_CUDA a cudaCheck follows. Any std::exception is forwarded to error().
 inline void reshapeFaceExtractorCaffe(std::shared_ptr<ResizeAndMergeCaffe<float>>& resizeAndMergeCaffe,
                                       std::shared_ptr<MaximumCaffe<float>>& maximumCaffe,
                                       boost::shared_ptr<caffe::Blob<float>>& caffeNetOutputBlob,
                                       std::shared_ptr<caffe::Blob<float>>& heatMapsBlob,
                                       std::shared_ptr<caffe::Blob<float>>& peaksBlob,
                                       const int gpuID)
 {
     try
     {
         // Raw, non-owning views of the blobs handed to the layers
         auto* const netOutputPtr = caffeNetOutputBlob.get();
         auto* const heatMapsPtr = heatMapsBlob.get();
         auto* const peaksPtr = peaksBlob.get();
         // Heat maps layer
         const bool mergeFirstDimension = true;
         resizeAndMergeCaffe->Reshape({netOutputPtr}, {heatMapsPtr},
                                      FACE_CCN_DECREASE_FACTOR, 1.f, mergeFirstDimension, gpuID);
         // Peaks layer
         maximumCaffe->Reshape({heatMapsPtr}, {peaksPtr});
         #ifdef USE_CUDA
             cudaCheck(__LINE__, __FUNCTION__, __FILE__);
         #endif
     }
     catch (const std::exception& e)
     {
         error(e.what(), __LINE__, __FUNCTION__, __FILE__);
     }
 }
Пример #5
0
// Allocates storage for an OpenGL buffer object and registers it with CUDA.
//   VBO:       name of the OpenGL buffer object.
//   byte_size: size in bytes of the data store to allocate for the buffer.
// The resulting CUDA handle is stored in cuda_graphics_resource_.
VBOCudaMapper::VBOCudaMapper(unsigned int VBO, unsigned int byte_size)
{
    // (Re)create the buffer's data store with no initial contents (nullptr), then unbind it
    glBindBuffer(GL_ARRAY_BUFFER, VBO);
    glBufferData(GL_ARRAY_BUFFER, byte_size, nullptr, GL_DYNAMIC_DRAW);
    glBindBuffer(GL_ARRAY_BUFFER, 0);

    // Register with the write-discard flag; the result is validated by cudaCheck
    cudaCheck(cudaGraphicsGLRegisterBuffer(&cuda_graphics_resource_, VBO, cudaGraphicsMapFlagsWriteDiscard));
}
Пример #6
0
// Prints RANK followed by the total and free GPU memory reported by cudaMemGetInfo,
// both expressed in MB (bytes / 1e6).
void memoryInfo(void)
{
	size_t freeBytes;
	size_t totalBytes;

	cudaCheck(cudaMemGetInfo(&freeBytes, &totalBytes), "MemInfo11");

	// Byte counts are first converted to float, then scaled to MB
	const double totalMB = (float)totalBytes / 1e6;
	const double freeMB = (float)freeBytes / 1e6;

	printf("\n");
	printf("\nRANK=%d\n", RANK);
	printf("\nGPU total memory = % .2f MB\n", totalMB);
	printf("\nGPU free  memory = % .2f MB\n", freeMB);
}
Пример #7
0
 // Renders the hand keypoints onto outputData using the GPU.
 // handKeypoints holds two keypoint arrays; both are uploaded back to back into pGpuHand
 // (the second one is assumed to have the same size as the first) and drawn in one call.
 // Drawing only happens when there is at least one person and the selected element is 0.
 // In CPU_ONLY builds the function just reports an error.
 void HandRenderer::renderHandGpu(Array<float>& outputData, const std::array<Array<float>, 2>& handKeypoints)
 {
     try
     {
         #ifdef CPU_ONLY
             error("GPU rendering not available if `CPU_ONLY` is set.", __LINE__, __FUNCTION__, __FILE__);
             UNUSED(outputData);
             UNUSED(handKeypoints);
         #else
             const auto selectedElement = spElementToRender->load();
             const auto& firstHands = handKeypoints[0];
             const auto& secondHands = handKeypoints[1];
             const auto peopleCount = firstHands.getSize(0);
             if (peopleCount > 0 && selectedElement == 0)
             {
                 cpuToGpuMemoryIfNotCopiedYet(outputData.getPtr());
                 // Number of floats (and bytes) occupied by one of the two keypoint arrays
                 const auto valuesPerArray = peopleCount * (firstHands.getSize(1) * firstHands.getSize(2));
                 const auto bytesPerArray = valuesPerArray * sizeof(float);
                 // Upload both arrays contiguously
                 cudaMemcpy(pGpuHand, firstHands.getConstPtr(), bytesPerArray, cudaMemcpyHostToDevice);
                 cudaMemcpy(pGpuHand + valuesPerArray, secondHands.getConstPtr(), bytesPerArray,
                            cudaMemcpyHostToDevice);
                 // Draw
                 renderHandKeypointsGpu(*spGpuMemoryPtr, mFrameSize, pGpuHand, 2 * peopleCount, mRenderThreshold);
                 cudaCheck(__LINE__, __FUNCTION__, __FILE__);
             }
             // Copy the frame back to the CPU if this is the last renderer
             gpuToCpuMemoryIfLastRenderer(outputData.getPtr());
             cudaCheck(__LINE__, __FUNCTION__, __FILE__);
         #endif
     }
     catch (const std::exception& e)
     {
         error(e.what(), __LINE__, __FUNCTION__, __FILE__);
     }
 }
Пример #8
0
 // Reshapes the first blob of caffeNet to `dimensions` and then reshapes the whole net so the
 // remaining layers adapt to it. With USE_CUDA a cudaCheck follows.
 // Any std::exception is forwarded to error().
 inline void reshapeNetCaffe(caffe::Net<float>* caffeNet, const std::vector<int>& dimensions)
 {
     try
     {
         const auto& firstBlob = caffeNet->blobs()[0];
         firstBlob->Reshape(dimensions);
         caffeNet->Reshape();
         #ifdef USE_CUDA
             cudaCheck(__LINE__, __FUNCTION__, __FILE__);
         #endif
     }
     catch (const std::exception& e)
     {
         error(e.what(), __LINE__, __FUNCTION__, __FILE__);
     }
 }
Пример #9
0
 // Runs the Caffe network on inputData (only when built with USE_CAFFE).
 // inputData must be non-empty and 4-dimensional with a second dimension of size 3.
 // If its size differs from the cached mNetInputSize4D the net is reshaped first. The data is
 // then copied into the first blob of the net (CUDA, OpenCL or CPU path, chosen at compile
 // time) and the forward pass is executed. Any std::exception is forwarded to error().
 void NetCaffe::forwardPass(const Array<float>& inputData) const
 {
     try
     {
         #ifdef USE_CAFFE
             // Input validation
             if (inputData.empty())
                 error("The Array inputData cannot be empty.", __LINE__, __FUNCTION__, __FILE__);
             const bool hasExpectedLayout = (inputData.getNumberDimensions() == 4 && inputData.getSize(1) == 3);
             if (!hasExpectedLayout)
                 error("The Array inputData must have 4 dimensions: [batch size, 3 (RGB), height, width].",
                       __LINE__, __FUNCTION__, __FILE__);
             auto& impl = *upImpl;
             // Reshape the net when the input size changed
             const auto inputSize = inputData.getSize();
             if (!vectorsAreEqual(impl.mNetInputSize4D, inputSize))
             {
                 impl.mNetInputSize4D = inputSize;
                 reshapeNetCaffe(impl.upCaffeNet.get(), inputSize);
             }
             // Copy the frame into the first blob of the net
             const auto& inputBlob = impl.upCaffeNet->blobs().at(0);
             const auto inputVolume = inputData.getVolume();
             #ifdef USE_CUDA
                 auto* gpuImagePtr = inputBlob->mutable_gpu_data();
                 cudaMemcpy(gpuImagePtr, inputData.getConstPtr(), inputVolume * sizeof(float),
                            cudaMemcpyHostToDevice);
             #elif defined USE_OPENCL
                 auto* gpuImagePtr = inputBlob->mutable_gpu_data();
                 cl::Buffer imageBuffer((cl_mem)gpuImagePtr, true);
                 op::OpenCL::getInstance(impl.mGpuId)->getQueue().enqueueWriteBuffer(imageBuffer, true, 0,
                                                                                     inputVolume * sizeof(float),
                                                                                     inputData.getConstPtr());
             #else
                 auto* cpuImagePtr = inputBlob->mutable_cpu_data();
                 const auto* const inputBegin = inputData.getConstPtr();
                 std::copy(inputBegin, inputBegin + inputVolume, cpuImagePtr);
             #endif
             // Forward pass
             impl.upCaffeNet->ForwardFrom(0);
             #ifdef USE_CUDA
                 cudaCheck(__LINE__, __FUNCTION__, __FILE__);
             #endif
         #else
             UNUSED(inputData);
         #endif
     }
     catch (const std::exception& e)
     {
         error(e.what(), __LINE__, __FUNCTION__, __FILE__);
     }
 }
Пример #10
0
 // Per-thread initialization of the GPU pose renderer. With USE_CUDA it allocates pGpuPose with
 // room for POSE_MAX_PEOPLE people, each with getPoseNumberBodyParts(mPoseModel) body parts of
 // 3 floats. Any std::exception is forwarded to error().
 void PoseGpuRenderer::initializationOnThread()
 {
     try
     {
         log("Starting initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
         #ifdef USE_CUDA
             // Size in bytes of the keypoint buffer used for rendering
             const auto poseBufferBytes = POSE_MAX_PEOPLE * getPoseNumberBodyParts(mPoseModel) * 3 * sizeof(float);
             cudaMalloc(reinterpret_cast<void**>(&pGpuPose), poseBufferBytes);
             cudaCheck(__LINE__, __FUNCTION__, __FILE__);
         #endif
         log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
     }
     catch (const std::exception& e)
     {
         error(e.what(), __LINE__, __FUNCTION__, __FILE__);
     }
 }
Пример #11
0
 // Returns the number of CUDA devices reported by cudaGetDeviceCount.
 // Without USE_CUDA it reports an error and returns -1; -1 is also returned after an exception
 // has been forwarded to error().
 int getCudaGpuNumber()
 {
     try
     {
         #ifndef USE_CUDA
             error("OpenPose must be compiled with the `USE_CUDA` macro definition in order to use this"
                   " functionality.", __LINE__, __FUNCTION__, __FILE__);
             return -1;
         #else
             int deviceCount;
             cudaGetDeviceCount(&deviceCount);
             cudaCheck(__LINE__, __FUNCTION__, __FILE__);
             return deviceCount;
         #endif
     }
     catch (const std::exception& e)
     {
         error(e.what(), __LINE__, __FUNCTION__, __FILE__);
         return -1;
     }
 }
Пример #12
0
// Round-trip test for the half-precision conversion routines:
//   1. float -> half on the device, compared against the CPU encoder (off-by-one allowed),
//   2. half -> float on the device, compared against the original inputs with a relative
//      tolerance of one thousandth.
TEST(HalfPrec, cuda) {
  float hostFloats[] = {
    -1,
    -100,
    2.3,
    0.0,
    1.0,
    3867.2,
  };
  // Unsigned element count; the loops below use size_t indices to avoid signed/unsigned comparisons.
  const size_t N = sizeof(hostFloats) / sizeof(hostFloats[0]);
  CUDA<float> devFloats(hostFloats, N);
  CUDA<half_t> devHalfs(N);

  halfprec_ToHalf(nullptr, devFloats.data(), devHalfs.data(), devFloats.size());
  cudaCheck(cudaDeviceSynchronize());

  {
    uint16_t cpuHalfs[N] = { 666 };
    facebook::math::Float16::encode(cpuHalfs, hostFloats, N);

    half_t convertedHalfs[N];
    devHalfs.toHost(convertedHalfs);
    for (size_t i = 0; i < N; i++) {
      // The CPU and GPU disagree by a digit sometimes because the GPU
      // is using a different rounding mode.
      EXPECT_NEAR(cpuHalfs[i], convertedHalfs[i], 1);
    }
  }

  CUDA<float> exploded(N);
  halfprec_ToFloat(nullptr, devHalfs.data(), exploded.data(), N);
  // Surface any failure of the second conversion here rather than at the copy below,
  // mirroring the check that follows halfprec_ToHalf.
  cudaCheck(cudaDeviceSynchronize());
  float postExpl[N];
  exploded.toHost(postExpl);
  for (size_t i = 0; i < N; i++) {
    auto thousandth = fabs(hostFloats[i] / 1000.0);
    EXPECT_NEAR(postExpl[i], hostFloats[i], thousandth);
  }
}
Пример #13
0
// Unmaps the graphics resource held in cuda_graphics_resource_ (one resource, null stream).
// The CUDA result is validated by cudaCheck.
void VBOCudaMapper::unmapVBOBuffer()
{
    cudaCheck(cudaGraphicsUnmapResources(1, &cuda_graphics_resource_, nullptr));
}
Пример #14
0
int main() {
	//Checks for memory leaks in debug mode
	_CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF);

	glfwInit();
	glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4);
	glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 4);
	glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
	glfwWindowHint(GLFW_RESIZABLE, GL_FALSE);

	GLFWwindow* window = glfwCreateWindow(width, height, "Hikari", nullptr, nullptr);
	glfwMakeContextCurrent(window);

	//Set callbacks for keyboard and mouse
	glfwSetInputMode(window, GLFW_CURSOR, GLFW_CURSOR_DISABLED);

	glewExperimental = GL_TRUE;
	glewInit();
	glGetError();

	//Define the viewport dimensions
	glViewport(0, 0, width, height);

	//Initialize cuda->opengl context
	cudaCheck(cudaGLSetGLDevice(0));
	cudaGraphicsResource *resource;

	//Create a texture to store ray tracing result
	GLuint tex;
	glActiveTexture(GL_TEXTURE0);
	glGenTextures(1, &tex);
	glBindTexture(GL_TEXTURE_2D, tex);

	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
	glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA32F, width, height, 0, GL_RGBA, GL_FLOAT, NULL);

	cudaCheck(cudaGraphicsGLRegisterImage(&resource, tex, GL_TEXTURE_2D, cudaGraphicsMapFlagsWriteDiscard));
	glBindTexture(GL_TEXTURE_2D, 0);

	Shader final = Shader("fsQuad.vert", "fsQuad.frag");
	FullscreenQuad fsQuad = FullscreenQuad();

	float4* buffer;
	cudaCheck(cudaMalloc((void**)&buffer, width * height * sizeof(float4)));
	cudaCheck(cudaMemset(buffer, 0, width * height * sizeof(float4)));

	//Mesh
	float3 offset = make_float3(0);
	float3 scale = make_float3(15);
	Mesh cBox("objs/Avent", 0, scale, offset);
	offset = make_float3(0, 55, 0);
	scale = make_float3(100);
	Mesh light("objs/plane", (int)cBox.triangles.size(), scale, offset);
	cBox.triangles.insert(cBox.triangles.end(), light.triangles.begin(), light.triangles.end());
	cBox.aabbs.insert(cBox.aabbs.end(), light.aabbs.begin(), light.aabbs.end());
	std::cout << "Num triangles: " << cBox.triangles.size() << std::endl;
	cBox.root = AABB(fminf(cBox.root.minBounds, light.root.minBounds), fmaxf(cBox.root.maxBounds, light.root.maxBounds));
	BVH bvh(cBox.aabbs, cBox.triangles, cBox.root);

	Camera cam(make_float3(14, 15, 80), make_int2(width, height), 45.0f, 0.04f, 80.0f);
	Camera* dCam;

	cudaCheck(cudaMalloc((void**)&dCam, sizeof(Camera)));
	cudaCheck(cudaMemcpy(dCam, &cam, sizeof(Camera), cudaMemcpyHostToDevice));

	cudaCheck(cudaGraphicsMapResources(1, &resource, 0));
	cudaArray* pixels;
	cudaCheck(cudaGraphicsSubResourceGetMappedArray(&pixels, resource, 0, 0));
	cudaResourceDesc viewCudaArrayResourceDesc;
	viewCudaArrayResourceDesc.resType = cudaResourceTypeArray;
	viewCudaArrayResourceDesc.res.array.array = pixels;
	cudaSurfaceObject_t viewCudaSurfaceObject;
	cudaCheck(cudaCreateSurfaceObject(&viewCudaSurfaceObject, &viewCudaArrayResourceDesc));
	cudaCheck(cudaGraphicsUnmapResources(1, &resource, 0));

	while (!glfwWindowShouldClose(window)) {
		float currentFrame = float(glfwGetTime());
		deltaTime = currentFrame - lastFrame;
		lastFrame = currentFrame;

		//Check and call events
		glfwPollEvents();
		handleInput(window, cam);

		if (cam.moved) {
			frameNumber = 0;
			cudaCheck(cudaMemset(buffer, 0, width * height * sizeof(float4)));
		}

		cam.rebuildCamera();
		cudaCheck(cudaMemcpy(dCam, &cam, sizeof(Camera), cudaMemcpyHostToDevice));

		frameNumber++;
				
		if (frameNumber < 20000) {
			cudaCheck(cudaGraphicsMapResources(1, &resource, 0));
			std::chrono::time_point<std::chrono::system_clock> start, end;
			start = std::chrono::system_clock::now();
			render(cam, dCam, viewCudaSurfaceObject, buffer, bvh.dTriangles, bvh.dNodes, frameNumber, cam.moved);
			end = std::chrono::system_clock::now();
			std::chrono::duration<double> elapsed = end - start;
			std::cout << "Frame: " << frameNumber << " --- Elapsed time: " << elapsed.count() << "s\n";
			cudaCheck(cudaGraphicsUnmapResources(1, &resource, 0));
		}

		cam.moved = false;

		glUseProgram(final.program);
		glActiveTexture(GL_TEXTURE0);
		glBindTexture(GL_TEXTURE_2D, tex);

		glClear(GL_COLOR_BUFFER_BIT);
		
		final.setUniformi("tRender", 0);
		fsQuad.render();

		//std::cout << glGetError() << std::endl;

		//Swap the buffers
		glfwSwapBuffers(window);
		glfwSetCursorPos(window, lastX, lastY);
	}
Пример #15
0
// Unregisters cuda_graphics_resource_ from CUDA. The CUDA result is validated by cudaCheck.
// NOTE(review): how cudaCheck reports a failure is not visible here; confirm it is safe to call
// from a destructor.
VBOCudaMapper::~VBOCudaMapper()
{
    cudaCheck(cudaGraphicsUnregisterResource(cuda_graphics_resource_));
}
Пример #16
0
 // Allocates a device array of n elements of type T and fills it with zero bytes.
 explicit CUDA(size_t n)
 : n_(n) {
   const auto numBytes = n_ * sizeof(T);
   cudaCheck(cudaMalloc(&vals_, numBytes));
   cudaCheck(cudaMemset(vals_, 0, numBytes));
 }
Пример #17
0
// Registers the OpenGL buffer object named VBO with CUDA using the write-discard flag and
// stores the resulting handle in cuda_graphics_resource_. Unlike the two-argument constructor,
// no OpenGL calls are made here, so the buffer's data store is not (re)allocated.
VBOCudaMapper::VBOCudaMapper(unsigned int VBO)
{
    cudaCheck(cudaGraphicsGLRegisterBuffer(&cuda_graphics_resource_, VBO, cudaGraphicsMapFlagsWriteDiscard));
}
Пример #18
0
 // Renders onto outputData, on the GPU, the element currently selected in spElementToRender.
 // Parameters:
 //   outputData:         frame to draw on; must not be empty.
 //   poseKeypoints:      keypoints to draw; its first dimension is used as the number of people.
 //   scaleInputToOutput: factor applied to the keypoints before drawing and, multiplied by
 //                       scaleNetToOutput, passed as the scale for heat map / PAF rendering.
 //   scaleNetToOutput:   see above; -1.f is rejected when a heat map / PAF element is selected.
 // Returns the pair (selected element, name of the rendered element). The name stays empty when
 // the keypoints (element 0) are rendered. After an exception has been forwarded to error(),
 // (-1, "") is returned.
 // Element selection, as implemented below:
 //   0                              -> pose keypoints
 //   1                              -> heat map with index numberBodyParts (background)
 //   2                              -> all heat maps
 //   3                              -> all PAFs
 //   4 .. numberBodyParts+3         -> heat map of body part (element - 4)
 //   next .. lastPAFChannel         -> a single PAF
 //   anything larger                -> neck-part distance channel (BODY_25D only)
 // Without USE_CUDA the function only reports an error.
 std::pair<int, std::string> PoseGpuRenderer::renderPose(Array<float>& outputData,
                                                         const Array<float>& poseKeypoints,
                                                         const float scaleInputToOutput,
                                                         const float scaleNetToOutput)
 {
     try
     {
         // Security checks
         if (outputData.empty())
             error("Empty Array<float> outputData.", __LINE__, __FUNCTION__, __FILE__);
         // GPU rendering
         const auto elementRendered = spElementToRender->load();
         std::string elementRenderedName;
         #ifdef USE_CUDA
             const auto numberPeople = poseKeypoints.getSize(0);
             // Skip the rendering work only when keypoints are selected, nobody was detected and
             // the original frame is being blended
             if (numberPeople > 0 || elementRendered != 0 || !mBlendOriginalFrame)
             {
                 cpuToGpuMemoryIfNotCopiedYet(outputData.getPtr(), outputData.getVolume());
                 cudaCheck(__LINE__, __FUNCTION__, __FILE__);
                 const auto numberBodyParts = getPoseNumberBodyParts(mPoseModel);
                 const auto numberBodyPartsPlusBkg = numberBodyParts+1;
                 const auto numberBodyPAFChannels = getPosePartPairs(mPoseModel).size();
                 // Point built as {getSize(1), getSize(0)} of the output frame
                 const Point<int> frameSize{outputData.getSize(1), outputData.getSize(0)};
                 // Draw poseKeypoints
                 if (elementRendered == 0)
                 {
                     // Rescale keypoints to output size
                     auto poseKeypointsRescaled = poseKeypoints.clone();
                     scaleKeypoints(poseKeypointsRescaled, scaleInputToOutput);
                     // Render keypoints
                     // NOTE(review): the copy size depends on numberPeople; confirm it can never
                     // exceed the capacity pGpuPose was allocated with.
                     if (!poseKeypoints.empty())
                         cudaMemcpy(pGpuPose,
                                    poseKeypointsRescaled.getConstPtr(),
                                    numberPeople * numberBodyParts * 3 * sizeof(float),
                                    cudaMemcpyHostToDevice);
                     renderPoseKeypointsGpu(*spGpuMemory, mPoseModel, numberPeople, frameSize, pGpuPose,
                                            mRenderThreshold, mShowGooglyEyes, mBlendOriginalFrame,
                                            getAlphaKeypoint());
                 }
                 else
                 {
                     // If resized to input resolution: Replace scaleNetToOutput * scaleInputToOutput by
                     // scaleInputToOutput, and comment the security checks.
                     // Security checks
                     if (scaleNetToOutput == -1.f)
                         error("Non valid scaleNetToOutput.", __LINE__, __FUNCTION__, __FILE__);
                     // Parameters
                     const auto& heatMapSizes = spPoseExtractorNet->getHeatMapSize();
                     const Point<int> heatMapSize{heatMapSizes[3], heatMapSizes[2]};
                     // Largest element index that still selects a single PAF
                     const auto lastPAFChannel = numberBodyPartsPlusBkg+2+numberBodyPAFChannels/2;
                     // Add all heatmaps
                     if (elementRendered == 2)
                     // if (elementRendered == numberBodyPartsPlusBkg+1)
                     {
                         elementRenderedName = "Heatmaps";
                         renderPoseHeatMapsGpu(*spGpuMemory, mPoseModel, frameSize,
                                               spPoseExtractorNet->getHeatMapGpuConstPtr(),
                                               heatMapSize, scaleNetToOutput * scaleInputToOutput,
                                               (mBlendOriginalFrame ? getAlphaHeatMap() : 1.f));
                     }
                     // Draw PAFs (Part Affinity Fields)
                     else if (elementRendered == 3)
                     // else if (elementRendered == numberBodyPartsPlusBkg+2)
                     {
                         elementRenderedName = "PAFs (Part Affinity Fields)";
                         renderPosePAFsGpu(*spGpuMemory, mPoseModel, frameSize,
                                           spPoseExtractorNet->getHeatMapGpuConstPtr(),
                                           heatMapSize, scaleNetToOutput * scaleInputToOutput,
                                           (mBlendOriginalFrame ? getAlphaHeatMap() : 1.f));
                     }
                     // Draw specific body part or background
                     else if (elementRendered <= numberBodyPartsPlusBkg+2)
                     {
                         // Element 1 maps to index numberBodyParts (the background); elements from
                         // 4 upwards map to body part (element - 4)
                         const auto realElementRendered = (elementRendered == 1
                                                             ? numberBodyParts
                                                             : elementRendered - 4);
                         elementRenderedName = mPartIndexToName.at(realElementRendered);
                         renderPoseHeatMapGpu(*spGpuMemory, mPoseModel, frameSize,
                                              spPoseExtractorNet->getHeatMapGpuConstPtr(),
                                              heatMapSize, scaleNetToOutput * scaleInputToOutput, realElementRendered,
                                              (mBlendOriginalFrame ? getAlphaHeatMap() : 1.f));
                     }
                     // Draw affinity between 2 body parts
                     else if (elementRendered <= lastPAFChannel)
                     {
                         // Each selectable PAF spans two consecutive entries, hence the factor 2
                         const auto affinityPart = (elementRendered-numberBodyPartsPlusBkg-3)*2;
                         const auto affinityPartMapped = numberBodyPartsPlusBkg
                                                       + getPoseMapIndex(mPoseModel).at(affinityPart);
                         elementRenderedName = mPartIndexToName.at(affinityPartMapped);
                         // Keep only the part of the name that precedes the first "("
                         elementRenderedName = elementRenderedName.substr(0, elementRenderedName.find("("));
                         renderPosePAFGpu(*spGpuMemory, mPoseModel, frameSize,
                                          spPoseExtractorNet->getHeatMapGpuConstPtr(),
                                          heatMapSize, scaleNetToOutput * scaleInputToOutput, affinityPartMapped,
                                          (mBlendOriginalFrame ? getAlphaHeatMap() : 1.f));
                     }
                     // Draw neck-part distance channel
                     else
                     {
                         if (mPoseModel != PoseModel::BODY_25D)
                             error("Neck-part distance channel only for BODY_25D.",
                                   __LINE__, __FUNCTION__, __FILE__);
                         const auto distancePart = (elementRendered - lastPAFChannel - 1);
                         const auto distancePartMapped = numberBodyPartsPlusBkg + numberBodyPAFChannels
                                                       + distancePart;
                         elementRenderedName = mPartIndexToName.at(distancePartMapped);
                         renderPoseDistance(*spGpuMemory, mPoseModel, frameSize,
                                            spPoseExtractorNet->getHeatMapGpuConstPtr(),
                                            heatMapSize, scaleNetToOutput * scaleInputToOutput, distancePartMapped,
                                            (mBlendOriginalFrame ? getAlphaHeatMap() : 1.f));
                     }
                 }
             }
             // GPU memory to CPU if last renderer
             gpuToCpuMemoryIfLastRenderer(outputData.getPtr(), outputData.getVolume());
             cudaCheck(__LINE__, __FUNCTION__, __FILE__);
         #else
             UNUSED(outputData);
             UNUSED(poseKeypoints);
             UNUSED(scaleInputToOutput);
             UNUSED(scaleNetToOutput);
             error("OpenPose must be compiled with the `USE_CUDA` macro definitions in order to run this"
                   " functionality. You can alternatively use CPU rendering (flag `--render_pose 1`).",
                   __LINE__, __FUNCTION__, __FILE__);
         #endif
         // Return result
         return std::make_pair(elementRendered, elementRenderedName);
     }
     catch (const std::exception& e)
     {
         error(e.what(), __LINE__, __FUNCTION__, __FILE__);
         return std::make_pair(-1, "");
     }
 }
Пример #19
0
 // Allocates a device array of n elements of type T and initializes it with the n elements
 // read from the host array base.
 CUDA(const T* base, size_t n) :
 n_(n) {
   const auto numBytes = n_ * sizeof(T);
   cudaCheck(cudaMalloc(&vals_, numBytes));
   cudaCheck(cudaMemcpy(vals_, base, numBytes, cudaMemcpyHostToDevice));
 }
Пример #20
0
    // Extracts the face keypoints of every person (only when built with USE_CAFFE).
    // Parameters:
    //   faceRectangles:     one rectangle per person; if empty, mFaceKeypoints is reset and
    //                       nothing else is done.
    //   cvInputData:        image the faces are cropped from; must not be empty.
    //   scaleInputToOutput: factor applied to the keypoint coordinates written to mFaceKeypoints.
    // For each rectangle whose smaller side exceeds 40, the face is warped to the net input size,
    // run through the Caffe net, the heat maps are resized/merged, the peaks are extracted and
    // mapped back to image coordinates. Smaller faces keep the value mFaceKeypoints was reset to.
    // When mHeatMapTypes is not empty the per-person heat maps are stored in mHeatMaps as well.
    // Any std::exception is forwarded to error().
    void FaceExtractorCaffe::forwardPass(const std::vector<Rectangle<float>>& faceRectangles,
                                         const cv::Mat& cvInputData,
                                         const double scaleInputToOutput)
    {
        try
        {
            #if defined USE_CAFFE
                if (!faceRectangles.empty())
                {
                    // Security checks
                    if (cvInputData.empty())
                        error("Empty cvInputData.", __LINE__, __FUNCTION__, __FILE__);

                    // Fix parameters
                    const auto netInputSide = fastMin(mNetOutputSize.x, mNetOutputSize.y);

                    // Set face size
                    const auto numberPeople = (int)faceRectangles.size();
                    mFaceKeypoints.reset({numberPeople, (int)FACE_NUMBER_PARTS, 3}, 0);

                    // HeatMaps: define size
                    if (!mHeatMapTypes.empty())
                        mHeatMaps.reset({numberPeople, (int)FACE_NUMBER_PARTS, mNetOutputSize.y, mNetOutputSize.x});

                    // // Debugging
                    // cv::Mat cvInputDataCopy = cvInputData.clone();
                    // Extract face keypoints for each person
                    for (auto person = 0 ; person < numberPeople ; person++)
                    {
                        const auto& faceRectangle = faceRectangles.at(person);
                        // Only consider faces with a minimum pixel area
                        const auto minFaceSize = fastMin(faceRectangle.width, faceRectangle.height);
                        // // Debugging -> red rectangle
                        // log(std::to_string(cvInputData.cols) + " " + std::to_string(cvInputData.rows));
                        // cv::rectangle(cvInputDataCopy,
                        //               cv::Point{(int)faceRectangle.x, (int)faceRectangle.y},
                        //               cv::Point{(int)faceRectangle.bottomRight().x,
                        //                         (int)faceRectangle.bottomRight().y},
                        //               cv::Scalar{0,0,255}, 2);
                        // Get parts
                        if (minFaceSize > 40)
                        {
                            // // Debugging -> green rectangle overwriting red one
                            // log(std::to_string(cvInputData.cols) + " " + std::to_string(cvInputData.rows));
                            // cv::rectangle(cvInputDataCopy,
                            //               cv::Point{(int)faceRectangle.x, (int)faceRectangle.y},
                            //               cv::Point{(int)faceRectangle.bottomRight().x,
                            //                         (int)faceRectangle.bottomRight().y},
                            //               cv::Scalar{0,255,0}, 2);
                            // Resize and shift image to face rectangle positions
                            // (Mscaling maps net-input coordinates to image coordinates; warpAffine
                            // below uses it with CV_WARP_INVERSE_MAP)
                            const auto faceSize = fastMax(faceRectangle.width, faceRectangle.height);
                            const double scaleFace = faceSize / (double)netInputSide;
                            cv::Mat Mscaling = cv::Mat::eye(2, 3, CV_64F);
                            Mscaling.at<double>(0,0) = scaleFace;
                            Mscaling.at<double>(1,1) = scaleFace;
                            Mscaling.at<double>(0,2) = faceRectangle.x;
                            Mscaling.at<double>(1,2) = faceRectangle.y;

                            cv::Mat faceImage;
                            cv::warpAffine(cvInputData, faceImage, Mscaling,
                                           cv::Size{mNetOutputSize.x, mNetOutputSize.y},
                                           CV_INTER_LINEAR | CV_WARP_INVERSE_MAP,
                                           cv::BORDER_CONSTANT, cv::Scalar(0,0,0));

                            // cv::Mat -> float*
                            uCharCvMatToFloatPtr(mFaceImageCrop.getPtr(), faceImage, true);

                            // // Debugging
                            // if (person < 5)
                            // cv::imshow("faceImage" + std::to_string(person), faceImage);

                            // 1. Caffe deep network
                            upImpl->spNetCaffe->forwardPass(mFaceImageCrop);

                            // Reshape blobs (only once, after the first forward pass)
                            if (!upImpl->netInitialized)
                            {
                                upImpl->netInitialized = true;
                                reshapeFaceExtractorCaffe(upImpl->spResizeAndMergeCaffe, upImpl->spMaximumCaffe,
                                                          upImpl->spCaffeNetOutputBlob, upImpl->spHeatMapsBlob,
                                                          upImpl->spPeaksBlob, upImpl->mGpuId);
                            }

                            // 2. Resize heat maps + merge different scales
                            // `defined` is used so the OpenCL branch is selected by the mere presence
                            // of the macro (also when it is defined empty), as done elsewhere.
                            #ifdef USE_CUDA
                                upImpl->spResizeAndMergeCaffe->Forward_gpu({upImpl->spCaffeNetOutputBlob.get()},
                                                                           {upImpl->spHeatMapsBlob.get()});
                                cudaCheck(__LINE__, __FUNCTION__, __FILE__);
                            #elif defined USE_OPENCL
                                upImpl->spResizeAndMergeCaffe->Forward_ocl({upImpl->spCaffeNetOutputBlob.get()},
                                                                           {upImpl->spHeatMapsBlob.get()});
                            #else
                                upImpl->spResizeAndMergeCaffe->Forward_cpu({upImpl->spCaffeNetOutputBlob.get()},
                                                                           {upImpl->spHeatMapsBlob.get()});
                            #endif

                            // 3. Get peaks by Non-Maximum Suppression
                            #ifdef USE_CUDA
                                upImpl->spMaximumCaffe->Forward_gpu({upImpl->spHeatMapsBlob.get()},
                                                                    {upImpl->spPeaksBlob.get()});
                                cudaCheck(__LINE__, __FUNCTION__, __FILE__);
                            #elif defined USE_OPENCL
                                // CPU Version is already very fast (4ms) and data is sent to connectKeypoints as CPU for now anyway
                                upImpl->spMaximumCaffe->Forward_cpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()});
                            #else
                                upImpl->spMaximumCaffe->Forward_cpu({upImpl->spHeatMapsBlob.get()},
                                                                    {upImpl->spPeaksBlob.get()});
                            #endif

                            // 4. Map every peak (x, y, score) back to image coordinates through
                            // Mscaling, then apply scaleInputToOutput
                            const auto* facePeaksPtr = upImpl->spPeaksBlob->mutable_cpu_data();
                            for (auto part = 0 ; part < mFaceKeypoints.getSize(1) ; part++)
                            {
                                const auto xyIndex = part * mFaceKeypoints.getSize(2);
                                const auto x = facePeaksPtr[xyIndex];
                                const auto y = facePeaksPtr[xyIndex + 1];
                                const auto score = facePeaksPtr[xyIndex + 2];
                                const auto baseIndex = mFaceKeypoints.getSize(2)
                                                     * (part + person * mFaceKeypoints.getSize(1));
                                mFaceKeypoints[baseIndex] = (float)(scaleInputToOutput
                                                                    * (Mscaling.at<double>(0,0) * x
                                                                       + Mscaling.at<double>(0,1) * y
                                                                       + Mscaling.at<double>(0,2)));
                                mFaceKeypoints[baseIndex+1] = (float)(scaleInputToOutput
                                                                      * (Mscaling.at<double>(1,0) * x
                                                                         + Mscaling.at<double>(1,1) * y
                                                                         + Mscaling.at<double>(1,2)));
                                mFaceKeypoints[baseIndex+2] = score;
                            }
                            // HeatMaps: storing
                            if (!mHeatMapTypes.empty()){
                                #ifdef USE_CUDA
                                    updateFaceHeatMapsForPerson(mHeatMaps, person, mHeatMapScaleMode,
                                                                upImpl->spHeatMapsBlob->gpu_data());
                                #else
                                    updateFaceHeatMapsForPerson(mHeatMaps, person, mHeatMapScaleMode,
                                                                upImpl->spHeatMapsBlob->cpu_data());
                                #endif
                            }
                        }
                    }
                    // // Debugging
                    // cv::imshow("AcvInputDataCopy", cvInputDataCopy);
                }
                else
                    mFaceKeypoints.reset();
            #else
                UNUSED(faceRectangles);
                UNUSED(cvInputData);
                UNUSED(scaleInputToOutput);
            #endif
        }
        catch (const std::exception& e)
        {
            error(e.what(), __LINE__, __FUNCTION__, __FILE__);
        }
    }
Пример #21
0
 // Copies the n_ elements stored in the device array vals_ into the host array base.
 void toHost(T* base) const {
   const auto numBytes = n_ * sizeof(T);
   cudaCheck(cudaMemcpy(base, vals_, numBytes, cudaMemcpyDeviceToHost));
 }
Пример #22
0
 // Frees the device allocation referenced by vals_; the CUDA result is validated by cudaCheck.
 // NOTE(review): how cudaCheck reports a failure is not visible here; confirm it is safe to call
 // from a destructor.
 ~CUDA() {
   cudaCheck(cudaFree(vals_));
 }