Ejemplo n.º 1
0
void runImageFilters(TColor *d_dst)
{
    switch(g_Kernel){
        case 0:
            cuda_Copy(d_dst, imageW, imageH);
        break;
        case 1:
            if(!g_Diag)
                cuda_KNN(d_dst, imageW, imageH, 1.0f / (knnNoise * knnNoise), lerpC);
            else
                cuda_KNNdiag(d_dst, imageW, imageH, 1.0f / (knnNoise * knnNoise), lerpC);
        break;
        case 2:
            if(!g_Diag)
                cuda_NLM(d_dst, imageW, imageH, 1.0f / (nlmNoise * nlmNoise), lerpC); 
            else
                cuda_NLMdiag(d_dst, imageW, imageH, 1.0f / (nlmNoise * nlmNoise), lerpC);
        break;
        case 3:
            if(!g_Diag)
                cuda_NLM2(d_dst, imageW, imageH, 1.0f / (nlmNoise * nlmNoise), lerpC); 
            else
                cuda_NLM2diag(d_dst, imageW, imageH, 1.0f / (nlmNoise * nlmNoise), lerpC);
        break;
    }
    cutilCheckMsg("Filtering kernel execution failed.\n");
}
void
benchmark(int iterations) 
{
    // allocate memory for result
    unsigned int *d_result;
    unsigned int size = width * height * sizeof(unsigned int);
    cutilSafeCall( cudaMalloc( (void**) &d_result, size));

    // warm-up
    gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);

    cutilSafeCall( cudaThreadSynchronize() );
    cutilCheckError( cutStartTimer( timer));

    // execute the kernel
    for(int i=0; i<iterations; i++) {
        gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);
    }

    cutilSafeCall( cudaThreadSynchronize() );
    cutilCheckError( cutStopTimer( timer));

    // check if kernel execution generated an error
    cutilCheckMsg("Kernel execution failed");

    printf("Processing time: %f (ms)\n", cutGetTimerValue( timer));
    printf("%.2f Mpixels/sec\n", (width*height*iterations / (cutGetTimerValue( timer) / 1000.0f)) / 1e6);

    cutilSafeCall(cudaFree(d_result));
}
Ejemplo n.º 3
0
/**
 * Selects a GPU for the CUDA execution.
 * @param id id of the selected GPU.
 */
void selectGPU(int id) {
    int deviceId;
    if (id == -1) {
        deviceId = cutGetMaxGflopsDeviceId();
    } else {
        deviceId = id;
    }

    cutilSafeCall(cudaSetDevice( deviceId ));
    cutilCheckMsg("cudaSetDevice failed");
}
Ejemplo n.º 4
0
////////////////////////////////////////////////////////////////////////////////
//! Run a simple benchmark test for CUDA
////////////////////////////////////////////////////////////////////////////////
void
runBenchmark( int argc, char **argv )
{
    int devID = 0;
    shrLog("[runBenchmark]: [%s]\n", sSDKsample);
    devID = cutilChooseCudaDevice(argc, argv);

    loadImageData(argc, argv);

    initCuda();

    g_CheckRender       = new CheckBackBuffer(width, height, 4, false);
    g_CheckRender->setExecPath(argv[0]);

    unsigned int *d_result;
    cutilSafeCall( cudaMalloc( (void **)&d_result, width*height*sizeof(unsigned int)) );

    // warm-up
    boxFilterRGBA(d_img, d_temp, d_temp, width, height, filter_radius, iterations, nthreads);
    cutilSafeCall( cutilDeviceSynchronize() );

    // Start round-trip timer and process iCycles loops on the GPU
    iterations = 1;     // standard 1-pass filtering
    const int iCycles = 150;
    double dProcessingTime = 0.0;
    shrLog("\nRunning BoxFilterGPU for %d cycles...\n\n", iCycles);
    shrDeltaT(2);
    for (int i = 0; i < iCycles; i++)
    {
        dProcessingTime += boxFilterRGBA(d_img, d_temp, d_img, width, height, filter_radius, iterations, nthreads);
    }

    // check if kernel execution generated an error and sync host
    cutilCheckMsg("Error: boxFilterRGBA Kernel execution FAILED");
    cutilSafeCall(cutilDeviceSynchronize());

    // Get average computation time
    dProcessingTime /= (double)iCycles;

    // log testname, throughput, timing and config info to sample and master logs
    shrLogEx(LOGBOTH | MASTER, 0, "boxFilter-texture, Throughput = %.4f M RGBA Pixels/s, Time = %.5f s, Size = %u RGBA Pixels, NumDevsUsed = %u, Workgroup = %u\n",
             (1.0e-6 * width * height)/dProcessingTime, dProcessingTime,
             (width * height), 1, nthreads);
    shrLog("\n");
}
Ejemplo n.º 5
0
void runAutoTest(int argc, char **argv)
{
    int devID = 0;
    shrLog("[runAutoTest]: [%s] (automated testing w/ readback)\n", sSDKsample);

    devID = cutilChooseCudaDevice(argc, argv);

    loadImageData(argc, argv);

    initCuda();

    g_CheckRender       = new CheckBackBuffer(width, height, 4, false);
    g_CheckRender->setExecPath(argv[0]);

    unsigned int *d_result;
    cutilSafeCall( cudaMalloc( (void **)&d_result, width*height*sizeof(unsigned int)) );

    for(int i = 0; i < 4; i++)
    {
        shrLog("[AutoTest]: %s (radius=%d)", sSDKsample, filter_radius );
        bilateralFilterRGBA(d_result, width, height, euclidean_delta, filter_radius, iterations, nthreads);

        // check if kernel execution generated an error
        cutilCheckMsg("Error: bilateralFilterRGBA Kernel execution FAILED");
        cutilSafeCall( cutilDeviceSynchronize() );
        cudaMemcpy(g_CheckRender->imageData(), d_result, width*height*sizeof(unsigned int), cudaMemcpyDeviceToHost);

        g_CheckRender->savePPM(sOriginal[i], false, NULL);

        if (!g_CheckRender->PPMvsPPM(sOriginal[i], sReference[i], MAX_EPSILON_ERROR, 0.15f)) {
            g_TotalErrors++;
        }
        gaussian_delta += 1.0f;
        euclidean_delta *= 1.25f;

		updateGaussian(gaussian_delta, filter_radius);
    }

    cutilSafeCall( cudaFree( d_result ) );
    delete g_CheckRender;
}
void
runAutoTest(int argc, char **argv) 
{
    // allocate memory for result
    unsigned int *d_result;
    unsigned int size = width * height * sizeof(unsigned int);
    cutilSafeCall( cudaMalloc( (void**) &d_result, size));

    // warm-up
    gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);

    cutilSafeCall( cudaThreadSynchronize() );
    cutilCheckError( cutStartTimer( timer));
    
    while (sigma <= 22) {    
        gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);
        cutilSafeCall( cudaThreadSynchronize() );
        // check if kernel execution generated an error
        cutilCheckMsg("Kernel execution failed");

        cudaMemcpy(g_CheckRender->imageData(), d_result, width*height*4, cudaMemcpyDeviceToHost);
        g_CheckRender->savePPM(sOriginal[g_Index], false, NULL);
        if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, 0.50f)) {
            g_TotalErrors++;
        }
        g_Index++;
        sigma += 4;
    }

    cutilCheckError( cutStopTimer( timer));

    printf("Processing time: %f (ms)\n", cutGetTimerValue( timer));
    printf("%.2f Mpixels/sec\n", (width*height*g_Index / (cutGetTimerValue( timer) / 1000.0f)) / 1e6);

    printf("Summary: %d errors!\n", g_TotalErrors);
	printf("Test %s!\n", (g_TotalErrors==0) ? "PASSED" : "FAILED");
    

    cutilSafeCall(cudaFree(d_result));
}
Ejemplo n.º 7
0
// render image using CUDA
void render()
{
	copyInvViewMatrix(invViewMatrix, sizeof(float4)*3);

    // map PBO to get CUDA device pointer
    uint *d_output;
	// map PBO to get CUDA device pointer
	cutilSafeCall(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
    size_t num_bytes; 
    cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes,  
						       cuda_pbo_resource));
    //printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes);

    // clear image
    cutilSafeCall(cudaMemset(d_output, 0, width*height*4));

    // call CUDA kernel, writing results to PBO
    render_kernel(gridSize, blockSize, d_output, width, height, density, brightness, transferOffset, transferScale);

    cutilCheckMsg("kernel failed");

    cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));
}
Ejemplo n.º 8
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv) 
{
    //start logs
    shrSetLogFileName ("volumeRender.txt");
    shrLog("%s Starting...\n\n", argv[0]); 

    if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") ||
		cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) 
	{
        g_bQAReadback = true;
        fpsLimit = frameCheckNumber;
    }

    if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) 
	{
        g_bQAGLVerify = true;
        fpsLimit = frameCheckNumber;
    }

    if (g_bQAReadback) {
	    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
        if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
            cutilDeviceInit(argc, argv);
        } else {
            cudaSetDevice( cutGetMaxGflopsDeviceId() );
        }

    } else {
        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        initGL( &argc, argv );

	    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
        if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
            cutilGLDeviceInit(argc, argv);
        } else {
            cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );
        }
/*
        int device;
        struct cudaDeviceProp prop;
        cudaGetDevice( &device );
        cudaGetDeviceProperties( &prop, device );
        if( !strncmp( "Tesla", prop.name, 5 ) ) {
            shrLog("This sample needs a card capable of OpenGL and display.\n");
            shrLog("Please choose a different device with the -device=x argument.\n");
            cutilExit(argc, argv);
        }
*/
	}

    // parse arguments
    char *filename;
    if (cutGetCmdLineArgumentstr( argc, (const char**) argv, "file", &filename)) {
        volumeFilename = filename;
    }
    int n;
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "size", &n)) {
        volumeSize.width = volumeSize.height = volumeSize.depth = n;
    }
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "xsize", &n)) {
        volumeSize.width = n;
    }
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "ysize", &n)) {
        volumeSize.height = n;
    }
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "zsize", &n)) {
         volumeSize.depth = n;
    }

    // load volume data
    char* path = shrFindFilePath(volumeFilename, argv[0]);
    if (path == 0) {
        shrLog("Error finding file '%s'\n", volumeFilename);
        exit(EXIT_FAILURE);
    }

    size_t size = volumeSize.width*volumeSize.height*volumeSize.depth*sizeof(VolumeType);
    void *h_volume = loadRawFile(path, size);
    
    initCuda(h_volume, volumeSize);
    free(h_volume);

    cutilCheckError( cutCreateTimer( &timer));

    shrLog("Press '=' and '-' to change density\n"
           "      ']' and '[' to change brightness\n"
           "      ';' and ''' to modify transfer function offset\n"
           "      '.' and ',' to modify transfer function scale\n\n");

    // calculate new grid size
    gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y));

    if (g_bQAReadback) {
        g_CheckRender = new CheckBackBuffer(width, height, 4, false);
        g_CheckRender->setPixelFormat(GL_RGBA);
        g_CheckRender->setExecPath(argv[0]);
        g_CheckRender->EnableQAReadback(true);

        uint *d_output;
        cutilSafeCall(cudaMalloc((void**)&d_output, width*height*sizeof(uint)));
        cutilSafeCall(cudaMemset(d_output, 0, width*height*sizeof(uint)));

        float modelView[16] = 
        {
            1.0f, 0.0f, 0.0f, 0.0f,
            0.0f, 1.0f, 0.0f, 0.0f,
            0.0f, 0.0f, 1.0f, 0.0f,
            0.0f, 0.0f, 4.0f, 1.0f
        };

        invViewMatrix[0] = modelView[0]; invViewMatrix[1] = modelView[4]; invViewMatrix[2] = modelView[8]; invViewMatrix[3] = modelView[12];
        invViewMatrix[4] = modelView[1]; invViewMatrix[5] = modelView[5]; invViewMatrix[6] = modelView[9]; invViewMatrix[7] = modelView[13];
        invViewMatrix[8] = modelView[2]; invViewMatrix[9] = modelView[6]; invViewMatrix[10] = modelView[10]; invViewMatrix[11] = modelView[14];

        // call CUDA kernel, writing results to PBO
	    copyInvViewMatrix(invViewMatrix, sizeof(float4)*3);
        
        // Start timer 0 and process n loops on the GPU 
        int nIter = 10;
        for (int i = -1; i < nIter; i++)
        {
            if( i == 0 ) {
                cudaThreadSynchronize();
                cutStartTimer(timer); 
            }
            
            render_kernel(gridSize, blockSize, d_output, width, height, density, brightness, transferOffset, transferScale);
        }
        cudaThreadSynchronize();
        cutStopTimer(timer);
        // Get elapsed time and throughput, then log to sample and master logs
        double dAvgTime = cutGetTimerValue(timer)/(nIter * 1000.0);
        shrLogEx(LOGBOTH | MASTER, 0, "volumeRender, Throughput = %.4f MTexels/s, Time = %.5f s, Size = %u Texels, NumDevsUsed = %u, Workgroup = %u\n", 
               (1.0e-6 * width * height)/dAvgTime, dAvgTime, (width * height), 1, blockSize.x * blockSize.y); 
        

        cutilCheckMsg("Error: render_kernel() execution FAILED");
        cutilSafeCall( cudaThreadSynchronize() );

        cutilSafeCall( cudaMemcpy(g_CheckRender->imageData(), d_output, width*height*4, cudaMemcpyDeviceToHost) );
        g_CheckRender->savePPM(sOriginal[g_Index], true, NULL);

        if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, THRESHOLD)) {
            shrLog("\nFAILED\n\n");
        } else {
            shrLog("\nPASSED\n\n");
        }

        cudaFree(d_output);
    	freeCudaBuffers();

        if (g_CheckRender) {
            delete g_CheckRender; g_CheckRender = NULL;
        }

    } else {
        // This is the normal rendering path for VolumeRender
        glutDisplayFunc(display);
        glutKeyboardFunc(keyboard);
        glutMouseFunc(mouse);
        glutMotionFunc(motion);
        glutReshapeFunc(reshape);
        glutIdleFunc(idle);

        initPixelBuffer();

        if (g_bQAGLVerify) {
            g_CheckRender = new CheckBackBuffer(width, height, 4);
            g_CheckRender->setPixelFormat(GL_RGBA);
            g_CheckRender->setExecPath(argv[0]);
            g_CheckRender->EnableQAReadback(true);
        }
        atexit(cleanup);

        glutMainLoop();
    }

    cudaThreadExit();
    shrEXIT(argc, (const char**)argv);
}
Ejemplo n.º 9
0
CUDASolverBundling::CUDASolverBundling(unsigned int maxNumberOfImages, unsigned int maxNumResiduals)
	: m_maxNumberOfImages(maxNumberOfImages)
	, THREADS_PER_BLOCK(512) // keep consistent with the GPU
{
	m_timer = NULL;
	//m_timer = new CUDATimer();
	//if (GlobalBundlingState::get().s_enableDetailedTimings) m_timer = new CUDATimer();
	m_bRecordConvergence = GlobalBundlingState::get().s_recordSolverConvergence;

	//TODO PARAMS
	const unsigned int submapSize = GlobalBundlingState::get().s_submapSize;
	m_verifyOptDistThresh = 0.02f;//GlobalAppState::get().s_verifyOptDistThresh;
	m_verifyOptPercentThresh = 0.05f;//GlobalAppState::get().s_verifyOptPercentThresh;

	const unsigned int numberOfVariables = maxNumberOfImages;
	m_maxCorrPerImage = math::clamp(maxNumResiduals / maxNumberOfImages, 1000u, 4000u);

	// State
	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_deltaRot, sizeof(float3)*numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_deltaTrans, sizeof(float3)*numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_rRot, sizeof(float3)*numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_rTrans, sizeof(float3)*numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_zRot, sizeof(float3)*numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_zTrans, sizeof(float3)*numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_pRot, sizeof(float3)*numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_pTrans, sizeof(float3)*numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_Jp, sizeof(float3)*maxNumResiduals));
	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_Ap_XRot, sizeof(float3)*numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_Ap_XTrans, sizeof(float3)*numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_scanAlpha, sizeof(float) * 2));
	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_rDotzOld, sizeof(float) *numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_precondionerRot, sizeof(float3)*numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_precondionerTrans, sizeof(float3)*numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_sumResidual, sizeof(float)));
	unsigned int n = (maxNumResiduals + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverExtra.d_maxResidual, sizeof(float) * n));
	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverExtra.d_maxResidualIndex, sizeof(int) * n));
	m_solverExtra.h_maxResidual = new float[n];
	m_solverExtra.h_maxResidualIndex = new int[n];

	MLIB_CUDA_SAFE_CALL(cudaMalloc(&d_variablesToCorrespondences, sizeof(int)*m_maxNumberOfImages*m_maxCorrPerImage));
	MLIB_CUDA_SAFE_CALL(cudaMalloc(&d_numEntriesPerRow, sizeof(int)*m_maxNumberOfImages));

	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_countHighResidual, sizeof(int)));

	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_denseJtJ, sizeof(float) * 36 * numberOfVariables * numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_denseJtr, sizeof(float) * 6 * numberOfVariables));
	m_maxNumDenseImPairs = m_maxNumberOfImages * (m_maxNumberOfImages - 1) / 2;
	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_denseCorrCounts, sizeof(float) * m_maxNumDenseImPairs));
	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_denseOverlappingImages, sizeof(uint2) * m_maxNumDenseImPairs));
	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_numDenseOverlappingImages, sizeof(int)));

	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_corrCount, sizeof(int)));
	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_corrCountColor, sizeof(int)));
	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_sumResidualColor, sizeof(float)));

#ifdef USE_LIE_SPACE
	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_xTransforms, sizeof(float4x4)*m_maxNumberOfImages));
	MLIB_CUDA_SAFE_CALL(cudaMalloc(&m_solverState.d_xTransformInverses, sizeof(float4x4)*m_maxNumberOfImages));
#else
	m_solverState.d_xTransforms = NULL;
	m_solverState.d_xTransformInverses = NULL;
#endif

#ifdef NEW_GUIDED_REMOVE
	cudaMalloc(&d_transforms, sizeof(float4x4)*m_maxNumberOfImages);
#endif

	//solve params
	m_maxResidualThresh = GlobalBundlingState::get().s_optMaxResThresh;
	m_defaultParams.denseDistThresh = GlobalBundlingState::get().s_denseDistThresh;
	m_defaultParams.denseNormalThresh = GlobalBundlingState::get().s_denseNormalThresh;
	m_defaultParams.denseColorThresh = GlobalBundlingState::get().s_denseColorThresh;
	m_defaultParams.denseColorGradientMin = GlobalBundlingState::get().s_denseColorGradientMin;
	m_defaultParams.denseDepthMin = GlobalBundlingState::get().s_denseDepthMin;
	m_defaultParams.denseDepthMax = GlobalBundlingState::get().s_denseDepthMax;
	m_defaultParams.denseOverlapCheckSubsampleFactor = GlobalBundlingState::get().s_denseOverlapCheckSubsampleFactor;

	//!!!DEBUGGING
	MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_deltaRot, -1, sizeof(float3)*numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_deltaTrans, -1, sizeof(float3)*numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_rRot, -1, sizeof(float3)*numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_rTrans, -1, sizeof(float3)*numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_zRot, -1, sizeof(float3)*numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_zTrans, -1, sizeof(float3)*numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_pRot, -1, sizeof(float3)*numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_pTrans, -1, sizeof(float3)*numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_Jp, -1, sizeof(float3)*maxNumResiduals));
	MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_Ap_XRot, -1, sizeof(float3)*numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_Ap_XTrans, -1, sizeof(float3)*numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_scanAlpha, -1, sizeof(float) * 2));
	MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_rDotzOld, -1, sizeof(float) *numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_precondionerRot, -1, sizeof(float3)*numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_precondionerTrans, -1, sizeof(float3)*numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_sumResidual, -1, sizeof(float)));
	MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverExtra.d_maxResidual, -1, sizeof(float) * n));
	MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverExtra.d_maxResidualIndex, -1, sizeof(int) * n));
	MLIB_CUDA_SAFE_CALL(cudaMemset(d_variablesToCorrespondences, -1, sizeof(int)*m_maxNumberOfImages*m_maxCorrPerImage));
	MLIB_CUDA_SAFE_CALL(cudaMemset(d_numEntriesPerRow, -1, sizeof(int)*m_maxNumberOfImages));
	MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_countHighResidual, -1, sizeof(int)));
	MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_denseJtJ, -1, sizeof(float) * 36 * numberOfVariables * numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_denseJtr, -1, sizeof(float) * 6 * numberOfVariables));
	MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_denseCorrCounts, -1, sizeof(float) * m_maxNumDenseImPairs));
	MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_denseOverlappingImages, -1, sizeof(uint2) * m_maxNumDenseImPairs));
	MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_numDenseOverlappingImages, -1, sizeof(int)));

	MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_corrCount, -1, sizeof(int)));
	MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_corrCountColor, -1, sizeof(int)));
	MLIB_CUDA_SAFE_CALL(cudaMemset(m_solverState.d_sumResidualColor, -1, sizeof(float)));

	cutilSafeCall(cudaDeviceSynchronize());
	cutilCheckMsg(__FUNCTION__);
	//!!!DEBUGGING
}
Ejemplo n.º 10
0
void CMarchingCubes::ComputeIsosurface(ElemType* _pFval, ElemType _isoValue, RenderData* _pRender)
{
	int threads = 128;
	dim3 grid(m_NumVoxels / threads, 1, 1);
	// get around maximum grid size of 65535 in each dimension
	if (grid.x > 65535) {
		grid.y = grid.x / 32768;
		grid.x = 32768;
	}

	uint totalVerts = 0;
	int size = m_GridSize.x * m_GridSize.y * m_GridSize.z * sizeof(float);
	//////////////////////////////////////////////////////////////////////////
	int len = m_GridSize.x * m_GridSize.y * m_GridSize.z;
	float *pFvalTemp = new float[len];
	for (int i = 0; i < len; i++)
	{
		pFvalTemp[i] = _pFval[i];
	}
	//////////////////////////////////////////////////////////////////////////
	float* pdVolumeFval;				// ¶¥µãº¯ÊýÖµÎÆÀí(n¡¡Surface)
	cutilSafeCall(cudaMalloc((void**) &pdVolumeFval, size));
	cutilSafeCall(cudaMemcpy(pdVolumeFval, pFvalTemp, size, cudaMemcpyHostToDevice) );
	bindVolumeValTexture(pdVolumeFval);
	delete []pFvalTemp;

	// calculate number of vertices need per voxel
	launch_classifyVoxel(grid, threads, 
		m_pdVoxelVerts, m_pdVoxelOccupied, pdVolumeFval,
		m_GridSize,	m_NumVoxels, _isoValue);

#if DEBUG_BUFFERS
	printf("voxelVerts:\n");
	dumpBuffer(m_pdVoxelVerts, m_NumVoxels);
#endif

#if SKIP_EMPTY_VOXELS

	// scan voxel occupied array
	cudppScan(m_Scanplan, m_pdVoxelOccupiedScan, m_pdVoxelOccupied, m_NumVoxels);

#if DEBUG_BUFFERS
	printf("voxelOccupiedScan:\n");
	dumpBuffer(m_pdVoxelOccupiedScan, m_NumVoxels);
#endif

	// read back values to calculate total number of non-empty voxels
	// since we are using an exclusive scan, the total is the last value of
	// the scan result plus the last value in the input array
	{
		uint lastElement, lastScanElement;
		cutilSafeCall(cudaMemcpy((void *) &lastElement, 
			(void *) (m_pdVoxelOccupied + m_NumVoxels - 1), 
			sizeof(uint), cudaMemcpyDeviceToHost));
		cutilSafeCall(cudaMemcpy((void *) &lastScanElement, 
			(void *) (m_pdVoxelOccupiedScan + m_NumVoxels - 1), 
			sizeof(uint), cudaMemcpyDeviceToHost));
		m_ActiveVoxels = lastElement + lastScanElement;
	}

	if (0 == m_ActiveVoxels) {
		// return if there are no full voxels
		totalVerts = 0;
		return;
	}

	// compact voxel index array
	launch_compactVoxels(grid, threads, m_pdCompactedVoxelArray, m_pdVoxelOccupied, m_pdVoxelOccupiedScan, m_NumVoxels);
	cutilCheckMsg("compactVoxels failed");

#endif // SKIP_EMPTY_VOXELS

	// scan voxel vertex count array
	cudppScan(m_Scanplan, m_pdVoxelVertsScan, m_pdVoxelVerts, m_NumVoxels);
#if DEBUG_BUFFERS
	printf("voxelVertsScan:\n");
	dumpBuffer(m_pdVoxelVertsScan, m_NumVoxels);
#endif

	// readback total number of vertices
	{
		uint lastElement, lastScanElement;
		cutilSafeCall(cudaMemcpy((void *) &lastElement, 
			(void *) (m_pdVoxelVerts + m_NumVoxels - 1), 
			sizeof(uint), cudaMemcpyDeviceToHost));
		cutilSafeCall(cudaMemcpy((void *) &lastScanElement, 
			(void *) (m_pdVoxelVertsScan + m_NumVoxels - 1), 
			sizeof(uint), cudaMemcpyDeviceToHost));
		totalVerts = lastElement + lastScanElement;
	}

	// create VBOs
	GLuint	posVbo, normalVbo;
	createVBO(&posVbo, totalVerts * sizeof(float) * 4);
	cutilSafeCall(cudaGLRegisterBufferObject(posVbo));
	createVBO(&normalVbo, totalVerts * sizeof(float) * 4);
	cutilSafeCall(cudaGLRegisterBufferObject(normalVbo));

	// generate triangles, writing to vertex buffers
	float4 *d_pos = 0, *d_normal = 0;
	cutilSafeCall(cudaGLMapBufferObject((void**)&d_pos, posVbo));
	cutilSafeCall(cudaGLMapBufferObject((void**)&d_normal, normalVbo));

#if SKIP_EMPTY_VOXELS
	dim3 grid2((int) ceil(m_ActiveVoxels / (float) NTHREADS), 1, 1);
#else
	dim3 grid2((int) ceil(m_NumVoxels / (float) NTHREADS), 1, 1);
#endif
	while(grid2.x > 65535) {
		grid2.x/=2;
		grid2.y*=2;
	}

	launch_generateTriangles(grid2, NTHREADS, d_pos, d_normal, 
		m_pdCompactedVoxelArray, m_pdVoxelVertsScan,
		m_pdVolume, pdVolumeFval,
		m_GridSize, _isoValue,
		m_ActiveVoxels, m_MaxVerts);

	cutilSafeCall(cudaGLUnmapBufferObject(normalVbo));
	cutilSafeCall(cudaGLUnmapBufferObject(posVbo));

	_pRender->posVbo = posVbo;
	_pRender->normalVbo = normalVbo;
	_pRender->totalVerts = totalVerts;
	cutilSafeCall(cudaFree(pdVolumeFval));
}
Ejemplo n.º 11
0
T benchmarkReduceMax(int  n, 
                  int  numThreads,
                  int  numBlocks,
                  int  maxThreads,
                  int  maxBlocks,
                  int  whichKernel, 
                  int  testIterations,
                  bool cpuFinalReduction,
                  int  cpuFinalThreshold,
                  unsigned int timer,
                  T* h_odata,
                  T* d_idata, 
                  T* d_odata)
{
    T gpu_result = 0;
    bool needReadBack = true;

    for (int i = 0; i < testIterations; ++i)
    {
        gpu_result = 0;

        cutilDeviceSynchronize();
        cutilCheckError( cutStartTimer( timer));

        // execute the kernel
        maxreduce<T>(n, numThreads, numBlocks, whichKernel, d_idata, d_odata);

        // check if kernel execution generated an error
        cutilCheckMsg("Kernel execution failed");

        if (cpuFinalReduction)
        {
            // sum partial sums from each block on CPU        
            // copy result from device to host
            cutilSafeCallNoSync( cudaMemcpy( h_odata, d_odata, numBlocks*sizeof(T), cudaMemcpyDeviceToHost) );

            for(int i=0; i<numBlocks; i++) 
            {
                gpu_result += h_odata[i];
            }

            needReadBack = false;
        }
        else
        {
            // sum partial block sums on GPU
            int s=numBlocks;
            int kernel = whichKernel;
            while(s > cpuFinalThreshold) 
            {
                int threads = 0, blocks = 0;
                getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads);
                
                maxreduce<T>(s, threads, blocks, kernel, d_odata, d_odata);
                
                if (kernel < 3)
                    s = (s + threads - 1) / threads;
                else
                    s = (s + (threads*2-1)) / (threads*2);
            }
            
            if (s > 1)
            {
                // copy result from device to host
                cutilSafeCallNoSync( cudaMemcpy( h_odata, d_odata, s * sizeof(T), cudaMemcpyDeviceToHost) );

                for(int i=0; i < s; i++) 
                {
                    gpu_result += h_odata[i];
                }

                needReadBack = false;
            }
        }

        cutilDeviceSynchronize();
        cutilCheckError( cutStopTimer(timer) );      
    }

    if (needReadBack)
    {
        // copy final sum from device to host
        cutilSafeCallNoSync( cudaMemcpy( &gpu_result, d_odata, sizeof(T), cudaMemcpyDeviceToHost) );
    }

    return gpu_result;
}
Ejemplo n.º 12
0
////////////////////////////////////////////////////////////////////////////////
//! Run the Cuda part of the computation
////////////////////////////////////////////////////////////////////////////////
void
computeIsosurface()
{
    int threads = 128;
    dim3 grid(numVoxels / threads, 1, 1);
    // get around maximum grid size of 65535 in each dimension
    if (grid.x > 65535) {
        grid.y = grid.x / 32768;
        grid.x = 32768;
    }

    // calculate number of vertices need per voxel
    launch_classifyVoxel(grid, threads, 
						d_voxelVerts, d_voxelOccupied, d_volume, 
						gridSize, gridSizeShift, gridSizeMask, 
						numVoxels, voxelSize, isoValue);
#if DEBUG_BUFFERS
    printf("voxelVerts:\n");
    dumpBuffer(d_voxelVerts, numVoxels, sizeof(uint));
#endif

#if SKIP_EMPTY_VOXELS
    // scan voxel occupied array
    cudppScan(scanplan, d_voxelOccupiedScan, d_voxelOccupied, numVoxels);
#if DEBUG_BUFFERS
    printf("voxelOccupiedScan:\n");
    dumpBuffer(d_voxelOccupiedScan, numVoxels, sizeof(uint));
#endif

    // read back values to calculate total number of non-empty voxels
    // since we are using an exclusive scan, the total is the last value of
    // the scan result plus the last value in the input array
    {
        uint lastElement, lastScanElement;
        cutilSafeCall(cudaMemcpy((void *) &lastElement, 
                       (void *) (d_voxelOccupied + numVoxels-1), 
                       sizeof(uint), cudaMemcpyDeviceToHost));
        cutilSafeCall(cudaMemcpy((void *) &lastScanElement, 
                       (void *) (d_voxelOccupiedScan + numVoxels-1), 
                       sizeof(uint), cudaMemcpyDeviceToHost));
        activeVoxels = lastElement + lastScanElement;
    }

    if (activeVoxels==0) {
        // return if there are no full voxels
        totalVerts = 0;
        return;
    }

    // compact voxel index array
    launch_compactVoxels(grid, threads, d_compVoxelArray, d_voxelOccupied, d_voxelOccupiedScan, numVoxels);
    cutilCheckMsg("compactVoxels failed");

#endif // SKIP_EMPTY_VOXELS

    // scan voxel vertex count array
    cudppScan(scanplan, d_voxelVertsScan, d_voxelVerts, numVoxels);
#if DEBUG_BUFFERS
    printf("voxelVertsScan:\n");
    dumpBuffer(d_voxelVertsScan, numVoxels, sizeof(uint));
#endif

    // readback total number of vertices
    {
        uint lastElement, lastScanElement;
        cutilSafeCall(cudaMemcpy((void *) &lastElement, 
                       (void *) (d_voxelVerts + numVoxels-1), 
                       sizeof(uint), cudaMemcpyDeviceToHost));
        cutilSafeCall(cudaMemcpy((void *) &lastScanElement, 
                       (void *) (d_voxelVertsScan + numVoxels-1), 
                       sizeof(uint), cudaMemcpyDeviceToHost));
        totalVerts = lastElement + lastScanElement;
    }

    // generate triangles, writing to vertex buffers
    if (!g_bQAReadback) {
		size_t num_bytes;
	    // DEPRECATED: cutilSafeCall(cudaGLMapBufferObject((void**)&d_pos, posVbo));
	    cutilSafeCall(cudaGraphicsMapResources(1, &cuda_posvbo_resource, 0));
	    cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void**)&d_pos, &num_bytes, cuda_posvbo_resource));

	    // DEPRECATED: cutilSafeCall(cudaGLMapBufferObject((void**)&d_normal, normalVbo));
	    cutilSafeCall(cudaGraphicsMapResources(1, &cuda_normalvbo_resource, 0));
	    cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void**)&d_normal, &num_bytes, cuda_normalvbo_resource));
    }

#if SKIP_EMPTY_VOXELS
    dim3 grid2((int) ceil(activeVoxels / (float) NTHREADS), 1, 1);
#else
    dim3 grid2((int) ceil(numVoxels / (float) NTHREADS), 1, 1);
#endif
    while(grid2.x > 65535) {
        grid2.x/=2;
        grid2.y*=2;
    }
#if SAMPLE_VOLUME
    launch_generateTriangles2(grid2, NTHREADS, d_pos, d_normal, 
                                            d_compVoxelArray, 
                                            d_voxelVertsScan, d_volume, 
                                            gridSize, gridSizeShift, gridSizeMask, 
                                            voxelSize, isoValue, activeVoxels, 
                                            maxVerts);
#else
    launch_generateTriangles(grid2, NTHREADS, d_pos, d_normal, 
                                           d_compVoxelArray, 
                                           d_voxelVertsScan, 
                                           gridSize, gridSizeShift, gridSizeMask, 
                                           voxelSize, isoValue, activeVoxels, 
                                           maxVerts);
#endif

    if (!g_bQAReadback) {
        // DEPRECATED: 		cutilSafeCall(cudaGLUnmapBufferObject(normalVbo));
		cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_normalvbo_resource, 0));
        // DEPRECATED: 		cutilSafeCall(cudaGLUnmapBufferObject(posVbo));
		cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_posvbo_resource, 0));
    }
}
Ejemplo n.º 13
0
void displayFunc(void){
	cutStartTimer(hTimer);
    TColor *d_dst = NULL;
	size_t num_bytes;

    if(frameCounter++ == 0) cutResetTimer(hTimer);
    // DEPRECATED: cutilSafeCall(cudaGLMapBufferObject((void**)&d_dst, gl_PBO));
    cutilSafeCall(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
	cutilCheckMsg("cudaGraphicsMapResources failed");
    cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void**)&d_dst, &num_bytes, cuda_pbo_resource));
	cutilCheckMsg("cudaGraphicsResourceGetMappedPointer failed");

    cutilSafeCall( CUDA_Bind2TextureArray()                      );

    runImageFilters(d_dst);

    cutilSafeCall( CUDA_UnbindTexture()     );
    // DEPRECATED: cutilSafeCall(cudaGLUnmapBufferObject(gl_PBO));
	cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));

	if (g_bFBODisplay) {
		g_FrameBufferObject->bindRenderPath();
	}

    // Common display code path
	{
        glClear(GL_COLOR_BUFFER_BIT);

        glTexSubImage2D( GL_TEXTURE_2D, 0, 0, 0, imageW, imageH, GL_RGBA, GL_UNSIGNED_BYTE, BUFFER_DATA(0) );
        glBegin(GL_TRIANGLES);
            glTexCoord2f(0, 0); glVertex2f(-1, -1);
            glTexCoord2f(2, 0); glVertex2f(+3, -1);
            glTexCoord2f(0, 2); glVertex2f(-1, +3);
        glEnd();
        glFinish();
    }

	if (g_bFBODisplay) {
		g_FrameBufferObject->unbindRenderPath();
        glBindTexture(GL_TEXTURE_2D, 0);
    }

    if (g_CheckRender && g_CheckRender->IsQAReadback() && g_Verify) {
        printf("> (Frame %d) readback BackBuffer\n", frameCount);
        if (g_bFBODisplay) {
            g_CheckRender->readback( imageW, imageH, g_FrameBufferObject->getFbo() );
        } else {
            g_CheckRender->readback( imageW, imageH );
        }
        g_CheckRender->savePPM ( sOriginal[g_Kernel], true, NULL );
        if (!g_CheckRender->PPMvsPPM(sOriginal[g_Kernel], sReference[g_Kernel], MAX_EPSILON_ERROR, 0.15f)) {
            g_TotalErrors++;
        }
        g_Verify = false;
    }

    if(frameCounter == frameN){
        frameCounter = 0;
        if(g_FPS){
            printf("FPS: %3.1f\n", frameN / (cutGetTimerValue(hTimer) * 0.001) );
            g_FPS = false;
        }
    }

	glutSwapBuffers();

	cutStopTimer(hTimer);
	computeFPS();
}