Esempio n. 1
0
void keyboard(unsigned char key, int x, int y)
{
    switch (key)
    {
        case 27:
            g_bExitESC = true;
            exit(EXIT_SUCCESS);
            break;

        case 'r':
            memset(hvfield, 0, sizeof(cData) * DS);
            cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS,
                       cudaMemcpyHostToDevice);

            initParticles(particles, DIM, DIM);

            cudaGraphicsUnregisterResource(cuda_vbo_resource);

            getLastCudaError("cudaGraphicsUnregisterBuffer failed");

            glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo);
            glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(cData) * DS,
                            particles, GL_DYNAMIC_DRAW_ARB);
            glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);

            cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone);

            getLastCudaError("cudaGraphicsGLRegisterBuffer failed");
            break;

        default:
            break;
    }
}
////////////////////////////////////////////////////////////////////////////////
//! Run the Cuda part of the computation
////////////////////////////////////////////////////////////////////////////////
void runCuda()
{
    cudaStream_t    stream = 0;
    const int nbResources = 2;
    cudaGraphicsResource *ppResources[nbResources] =
    {
        g_histogram.cudaResource,
        g_color.cudaResource,
    };
    // Map resources for Cuda
    checkCudaErrors(cudaGraphicsMapResources(nbResources, ppResources, stream));
    getLastCudaError("cudaGraphicsMapResources(2) failed");
    // Get pointers
    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&g_histogram.cudaBuffer, &g_histogram.size, g_histogram.cudaResource));
    getLastCudaError("cudaGraphicsResourceGetMappedPointer (g_color.pBuffer) failed");
    cudaGraphicsSubResourceGetMappedArray(&g_color.pCudaArray, g_color.cudaResource, 0, 0);
    getLastCudaError("cudaGraphicsSubResourceGetMappedArray (g_color.pBuffer) failed");

    // Execute kernel
    createHistogramTex(g_histogram.cudaBuffer, g_WindowWidth, g_WindowHeight, g_color.pCudaArray);
    checkCudaError();
    //
    // unmap the resources
    //
    checkCudaErrors(cudaGraphicsUnmapResources(nbResources, ppResources, stream));
    getLastCudaError("cudaGraphicsUnmapResources(2) failed");
}
Esempio n. 3
0
//-----------------------------------------------------------------------------
// Name: Cleanup()
// Desc: Releases all previously initialized objects
//-----------------------------------------------------------------------------
VOID Cleanup()
{
    if (g_pVB != NULL)
    {

        // Unregister vertex buffer
        // DEPRECATED: checkCudaErrors(cudaD3D10UnregisterResource(g_pVB));
        cudaGraphicsUnregisterResource(cuda_VB_resource);
        getLastCudaError("cudaGraphicsUnregisterResource failed");

        g_pVB->Release();
    }

    if (g_pInputLayout != NULL)
        g_pInputLayout->Release();

    if (g_pSimpleEffect != NULL)
        g_pSimpleEffect->Release();

    if (g_pSwapChainRTV != NULL)
        g_pSwapChainRTV->Release();

    if (g_pSwapChain != NULL)
        g_pSwapChain->Release();

    // Uninitialize CUDA

    cudaDeviceReset();
    getLastCudaError("cudaDeviceReset failed");

    if (g_pd3dDevice != NULL)
        g_pd3dDevice->Release();


}
Esempio n. 4
0
//-----------------------------------------------------------------------------
// Name: RunCUDA()
// Desc: Launches the CUDA kernels to fill in the texture data
//-----------------------------------------------------------------------------
void RunCUDA()
{
    //
    // map the resources we've registered so we can access them in Cuda
    // - it is most efficient to map and unmap all resources in a single call,
    //   and to have the map/unmap calls be the boundary between using the GPU
    //   for Direct3D and Cuda
    //

    if (!g_bDeviceLost)
    {
        cudaStream_t    stream = 0;
        const int nbResources = 3;
        cudaGraphicsResource *ppResources[nbResources] =
        {
            g_texture_2d.cudaResource,
            g_texture_vol.cudaResource,
            g_texture_cube.cudaResource,
        };
        cudaGraphicsMapResources(nbResources, ppResources, stream);
        getLastCudaError("cudaGraphicsMapResources(3) failed");

        //
        // run kernels which will populate the contents of those textures
        //
        RunKernels();

        //
        // unmap the resources
        //
        cudaGraphicsUnmapResources(nbResources, ppResources, stream);
        getLastCudaError("cudaGraphicsUnmapResources(3) failed");
    }
}
void displayFunc(void)
{
    sdkStartTimer(&timer);
    TColor *d_dst = NULL;
    size_t num_bytes;

    if (frameCounter++ == 0)
    {
        sdkResetTimer(&timer);
    }

    // DEPRECATED: checkCudaErrors(cudaGLMapBufferObject((void**)&d_dst, gl_PBO));
    checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
    getLastCudaError("cudaGraphicsMapResources failed");
    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_dst, &num_bytes, cuda_pbo_resource));
    getLastCudaError("cudaGraphicsResourceGetMappedPointer failed");

    checkCudaErrors(CUDA_Bind2TextureArray());

    runImageFilters(d_dst);

    checkCudaErrors(CUDA_UnbindTexture());
    // DEPRECATED: checkCudaErrors(cudaGLUnmapBufferObject(gl_PBO));
    checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));

    // Common display code path
    {
        glClear(GL_COLOR_BUFFER_BIT);

        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, imageW, imageH, GL_RGBA, GL_UNSIGNED_BYTE, BUFFER_DATA(0));
        glBegin(GL_TRIANGLES);
        glTexCoord2f(0, 0);
        glVertex2f(-1, -1);
        glTexCoord2f(2, 0);
        glVertex2f(+3, -1);
        glTexCoord2f(0, 2);
        glVertex2f(-1, +3);
        glEnd();
        glFinish();
    }

    if (frameCounter == frameN)
    {
        frameCounter = 0;

        if (g_FPS)
        {
            printf("FPS: %3.1f\n", frameN / (sdkGetTimerValue(&timer) * 0.001));
            g_FPS = false;
        }
    }

    glutSwapBuffers();
    glutReportErrors();

    sdkStopTimer(&timer);

    computeFPS();
}
Esempio n. 6
0
void keyboard(unsigned char key, int x, int y)
{
    switch (key)
    {
        case 27:
            g_bExitESC = true;
            exit(EXIT_SUCCESS);
            break;
        
        case 'f':
        	if (!fullscreen)
        	{
        		fullscreen = 1;
				glutFullScreenToggle();
			}
			else
			{
				fullscreen = 0;
				glutLeaveFullScreen();
			}
			break;

        case 'r':
			pthread_mutex_lock(&display_mutex);
            memset(hvfield, 0, sizeof(cData) * DS);
            cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS,
                       cudaMemcpyHostToDevice);

            initParticles(particles, DIM, DIM);

#ifndef OPTIMUS
            cudaGraphicsUnregisterResource(cuda_vbo_resource);
            getLastCudaError("cudaGraphicsUnregisterBuffer failed");
#endif

#if defined(OPTIMUS) || defined(BROADCAST)
            cudaMemcpy(particles_gpu, particles, sizeof(cData) * DS, cudaMemcpyHostToDevice);
#endif

            glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo);
            glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(cData) * DS,
                            particles, GL_DYNAMIC_DRAW_ARB);
            glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);

#ifndef OPTIMUS
            cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone);
            getLastCudaError("cudaGraphicsGLRegisterBuffer failed");
#endif
			pthread_mutex_unlock(&display_mutex);
            break;

        default:
            break;
    }
}
Esempio n. 7
0
void runAutoTest(const char *ref_file, char *exec_path)
{
    checkCudaErrors(cudaMalloc((void **)&d_output, width*height*sizeof(GLubyte)*4));

    // render the volumeData
    render_kernel(gridSize, blockSize, d_output, width, height, w);

    checkCudaErrors(cudaDeviceSynchronize());
    getLastCudaError("render_kernel failed");

    void *h_output = malloc(width*height*sizeof(GLubyte)*4);
    checkCudaErrors(cudaMemcpy(h_output, d_output, width*height*sizeof(GLubyte)*4, cudaMemcpyDeviceToHost));
    sdkDumpBin(h_output, width*height*sizeof(GLubyte)*4, "simpleTexture3D.bin");

    bool bTestResult = sdkCompareBin2BinFloat("simpleTexture3D.bin", sdkFindFilePath(ref_file, exec_path), width*height,
                                              MAX_EPSILON_ERROR, THRESHOLD, exec_path);

    checkCudaErrors(cudaFree(d_output));
    free(h_output);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    sdkStopTimer(&timer);
    sdkDeleteTimer(&timer);

    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
void
benchmark(int iterations)
{
    // allocate memory for result
    unsigned int *d_result;
    unsigned int size = width * height * sizeof(unsigned int);
    checkCudaErrors(cudaMalloc((void **) &d_result, size));

    // warm-up
    gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStartTimer(&timer);

    // execute the kernel
    for (int i = 0; i < iterations; i++)
    {
        gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);
    }

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timer);

    // check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    printf("%.2f Mpixels/sec\n", (width*height*iterations / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);

    checkCudaErrors(cudaFree(d_result));
}
Esempio n. 9
0
HRESULT InitCUDA()
{
    printf("InitCUDA() g_pD3DDevice = %p\n", g_pD3DDevice);

    // Now we need to bind a CUDA context to the DX9 device
    // This is the CUDA 2.0 DX9 interface (required for Windows XP and Vista)
    cudaD3D9SetDirect3DDevice(g_pD3DDevice);
    getLastCudaError("cudaD3D9SetDirect3DDevice failed");

    return S_OK;
}
Esempio n. 10
0
//-----------------------------------------------------------------------------
// Name: ReleaseTextures()
// Desc: Release Direct3D Textures (free-ing)
//-----------------------------------------------------------------------------
HRESULT ReleaseTextures()
{
    // unregister the Cuda resources
    cudaGraphicsUnregisterResource(g_texture_2d.cudaResource);
    getLastCudaError("cudaGraphicsUnregisterResource (g_texture_2d) failed");
    cudaFree(g_texture_2d.cudaLinearMemory);
    getLastCudaError("cudaFree (g_texture_2d) failed");

    cudaGraphicsUnregisterResource(g_texture_cube.cudaResource);
    getLastCudaError("cudaGraphicsUnregisterResource (g_texture_cube) failed");
    cudaFree(g_texture_cube.cudaLinearMemory);
    getLastCudaError("cudaFree (g_texture_2d) failed");

    cudaGraphicsUnregisterResource(g_texture_vol.cudaResource);
    getLastCudaError("cudaGraphicsUnregisterResource (g_texture_vol) failed");
    cudaFree(g_texture_vol.cudaLinearMemory);
    getLastCudaError("cudaFree (g_texture_vol) failed");


    //
    // clean up Direct3D
    //
    {
        // release the resources we created
        g_texture_2d.pTexture->Release();
        g_texture_cube.pTexture->Release();
        g_texture_vol.pTexture->Release();
    }

    return S_OK;
}
Esempio n. 11
0
//-----------------------------------------------------------------------------
// Name: Cleanup()
// Desc: Releases all previously initialized objects
//-----------------------------------------------------------------------------
VOID Cleanup()
{
    if (g_pVB != NULL)
    {

        // Unregister vertex buffer
        // DEPRECATED: checkCudaErrors(cudaD3D10UnregisterResource(g_pVB));
        cudaGraphicsUnregisterResource(cuda_VB_resource);
        getLastCudaError("cudaGraphicsUnregisterResource failed");

        g_pVB->Release();
    }

    if (g_pInputLayout != NULL)
        g_pInputLayout->Release();

    if (g_pSimpleEffect != NULL)
        g_pSimpleEffect->Release();

    if (g_pSwapChainRTV != NULL)
        g_pSwapChainRTV->Release();

    if (g_pSwapChain != NULL)
        g_pSwapChain->Release();

    // Uninitialize CUDA

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    getLastCudaError("cudaDeviceReset failed");

    if (g_pd3dDevice != NULL)
        g_pd3dDevice->Release();


}
Esempio n. 12
0
HRESULT ReleaseCUDA()
{
    // Uninitialize CUDA

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    getLastCudaError("cudaDeviceReset failed");
    return S_OK;
}
Esempio n. 13
0
////////////////////////////////////////////////////////////////////////////////
//! Run the Cuda part of the computation
////////////////////////////////////////////////////////////////////////////////
void runCuda()
{
    // Map vertex buffer to Cuda
    float4 *d_ptr;

    // CUDA Map call to the Vertex Buffer and return a pointer
    // DEPRECATED: cudaD3D10MapResources(1, (ID3D10Resource **)&g_pVB);
    checkCudaErrors(cudaGraphicsMapResources(1, &cuda_VB_resource, 0));
    getLastCudaError("cudaGraphicsMapResources failed");

    // DEPRECATED: cudaD3D10ResourceGetMappedPointer( (void **)&dptr, g_pVB, 0);
    size_t num_bytes;
    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_ptr, &num_bytes, cuda_VB_resource));
    getLastCudaError("cudaGraphicsResourceGetMappedPointer failed");

    // Execute kernel
    simpleD3DKernel(d_ptr, g_MeshWidth, g_MeshHeight, anim);

    // CUDA Map Unmap vertex buffer
    // DEPRECATED: cudaD3D10UnmapResources(1, (ID3D10Resource **)&g_pVB);
    checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_VB_resource, 0));
    getLastCudaError("cudaGraphicsUnmapResource failed");
}
Esempio n. 14
0
// This test specifies a single test (where you specify radius and/or iterations)
int runSingleTest(char *ref_file, char *exec_path)
{
    int nTotalErrors = 0;
    char dump_file[256];

    printf("[runSingleTest]: [%s]\n", sSDKsample);

    initCuda();

    unsigned int *dResult;
    unsigned int *hResult = (unsigned int *)malloc(width * height * sizeof(unsigned int));
    size_t pitch;
    checkCudaErrors(cudaMallocPitch((void **)&dResult, &pitch, width*sizeof(unsigned int), height));

    // run the sample radius
    {
        printf("%s (radius=%d) (passes=%d) ", sSDKsample, filter_radius, iterations);
        bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer);

        // check if kernel execution generated an error
        getLastCudaError("Error: bilateralFilterRGBA Kernel execution FAILED");
        checkCudaErrors(cudaDeviceSynchronize());

        // readback the results to system memory
        cudaMemcpy2D(hResult, sizeof(unsigned int)*width, dResult, pitch,
                     sizeof(unsigned int)*width, height, cudaMemcpyDeviceToHost);

        sprintf(dump_file, "nature_%02d.ppm", filter_radius);

        sdkSavePPM4ub((const char *)dump_file, (unsigned char *)hResult, width, height);

        if (!sdkComparePPM(dump_file, sdkFindFilePath(ref_file, exec_path), MAX_EPSILON_ERROR, 0.15f, false))
        {
            printf("Image is Different ");
            nTotalErrors++;
        }
        else
        {
            printf("Image is Matching ");
        }

        printf(" <%s>\n", ref_file);
    }
    printf("\n");

    free(hResult);
    checkCudaErrors(cudaFree(dResult));

    return nTotalErrors;
}
void runImageFilters(TColor *d_dst)
{
    switch (g_Kernel)
    {
        case 0:
            cuda_Copy(d_dst, imageW, imageH);
            break;

        case 1:
            if (!g_Diag)
            {
                cuda_KNN(d_dst, imageW, imageH, 1.0f / (knnNoise * knnNoise), lerpC);
            }
            else
            {
                cuda_KNNdiag(d_dst, imageW, imageH, 1.0f / (knnNoise * knnNoise), lerpC);
            }

            break;

        case 2:
            if (!g_Diag)
            {
                cuda_NLM(d_dst, imageW, imageH, 1.0f / (nlmNoise * nlmNoise), lerpC);
            }
            else
            {
                cuda_NLMdiag(d_dst, imageW, imageH, 1.0f / (nlmNoise * nlmNoise), lerpC);
            }

            break;

        case 3:
            if (!g_Diag)
            {
                cuda_NLM2(d_dst, imageW, imageH, 1.0f / (nlmNoise * nlmNoise), lerpC);
            }
            else
            {
                cuda_NLM2diag(d_dst, imageW, imageH, 1.0f / (nlmNoise * nlmNoise), lerpC);
            }

            break;
    }

    getLastCudaError("Filtering kernel execution failed.\n");
}
bool
runSingleTest(const char *ref_file, const char *exec_path)
{
    // allocate memory for result
    int nTotalErrors = 0;
    unsigned int *d_result;
    unsigned int size = width * height * sizeof(unsigned int);
    checkCudaErrors(cudaMalloc((void **) &d_result, size));

    // warm-up
    gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStartTimer(&timer);

    gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);
    checkCudaErrors(cudaDeviceSynchronize());
    getLastCudaError("Kernel execution failed");
    sdkStopTimer(&timer);

    unsigned char *h_result = (unsigned char *)malloc(width*height*4);
    checkCudaErrors(cudaMemcpy(h_result, d_result, width*height*4, cudaMemcpyDeviceToHost));

    char dump_file[1024];
    sprintf(dump_file, "lena_%02d.ppm", (int)sigma);
    sdkSavePPM4ub(dump_file, h_result, width, height);

    if (!sdkComparePPM(dump_file, sdkFindFilePath(ref_file, exec_path), MAX_EPSILON_ERROR, THRESHOLD, false))
    {
        nTotalErrors++;
    }

    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    printf("%.2f Mpixels/sec\n", (width*height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);

    checkCudaErrors(cudaFree(d_result));
    free(h_result);

    printf("Summary: %d errors!\n", nTotalErrors);

    printf(nTotalErrors == 0 ? "Test passed\n": "Test failed!\n");
    return (nTotalErrors == 0);
}
Esempio n. 17
0
////////////////////////////////////////////////////////////////////////////////
//! Run a simple benchmark test for CUDA
////////////////////////////////////////////////////////////////////////////////
int runBenchmark(int argc, char **argv)
{
    printf("[runBenchmark]: [%s]\n", sSDKsample);

    loadImageData(argc, argv);
    initCuda();

    unsigned int *dResult;
    size_t pitch;
    checkCudaErrors(cudaMallocPitch((void **)&dResult, &pitch, width*sizeof(unsigned int), height));
    sdkStartTimer(&kernel_timer);

    // warm-up
    bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer);
    checkCudaErrors(cudaDeviceSynchronize());

    // Start round-trip timer and process iCycles loops on the GPU
    iterations = 1;     // standard 1-pass filtering
    const int iCycles = 150;
    double dProcessingTime = 0.0;
    printf("\nRunning BilateralFilterGPU for %d cycles...\n\n", iCycles);

    for (int i = 0; i < iCycles; i++)
    {
        dProcessingTime += bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer);
    }

    // check if kernel execution generated an error and sync host
    getLastCudaError("Error: bilateralFilterRGBA Kernel execution FAILED");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&kernel_timer);

    // Get average computation time
    dProcessingTime /= (double)iCycles;

    // log testname, throughput, timing and config info to sample and master logs
    printf("bilateralFilter-texture, Throughput = %.4f M RGBA Pixels/s, Time = %.5f s, Size = %u RGBA Pixels, NumDevsUsed = %u\n",
           (1.0e-6 * width * height)/dProcessingTime, dProcessingTime, (width * height), 1);
    printf("\n");

    return 0;
}
Esempio n. 18
0
// render image using CUDA
void render()
{
    // map PBO to get CUDA device pointer
    g_GraphicsMapFlag++;
    checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
    size_t num_bytes;
    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, cuda_pbo_resource));
    //printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes);

    // call CUDA kernel, writing results to PBO
    render_kernel(gridSize, blockSize, d_output, width, height, w);

    getLastCudaError("render_kernel failed");

    if (g_GraphicsMapFlag)
    {
        checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));
        g_GraphicsMapFlag--;
    }
}
Esempio n. 19
0
void runAutoTest(int argc, char **argv, const char *dump_filename, eFilterMode filter_mode)
{
    cudaDeviceProp deviceProps;

    int devID = findCudaDevice(argc, (const char **)argv);

    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));

    printf("[%s] (automated testing w/ readback)\n", sSDKsample);
    printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount);

    loadImageData(argc, argv);

    uchar4 *d_output;
    checkCudaErrors(cudaMalloc((void **)&d_output, imageWidth*imageHeight*4));
    unsigned int *h_result = (unsigned int *)malloc(width * height * sizeof(unsigned int));

    printf("AutoTest: %s Filter Mode: <%s>\n", sSDKsample, sFilterMode[g_FilterMode]);

    render(imageWidth, imageHeight,
           tx, ty, scale, cx, cy,
           blockSize, gridSize, filter_mode, d_output);

    // check if kernel execution generated an error
    getLastCudaError("Error: render (bicubicTexture) Kernel execution FAILED");
    checkCudaErrors(cudaDeviceSynchronize());

    cudaMemcpy(h_result, d_output, imageWidth*imageHeight*4, cudaMemcpyDeviceToHost);

    sdkSavePPM4ub(dump_filename, (unsigned char *)h_result, imageWidth, imageHeight);

    checkCudaErrors(cudaFree(d_output));
    free(h_result);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple benchmark test for CUDA
////////////////////////////////////////////////////////////////////////////////
int runBenchmark()
{
    printf("[runBenchmark]: [%s]\n", sSDKsample);

    initCuda(true);

    unsigned int *d_result;
    checkCudaErrors(cudaMalloc((void **)&d_result, width*height*sizeof(unsigned int)));

    // warm-up
    boxFilterRGBA(d_img, d_temp, d_temp, width, height, filter_radius, iterations, nthreads, kernel_timer);
    checkCudaErrors(cudaDeviceSynchronize());

    sdkStartTimer(&kernel_timer);
    // Start round-trip timer and process iCycles loops on the GPU
    iterations = 1;     // standard 1-pass filtering
    const int iCycles = 150;
    double dProcessingTime = 0.0;
    printf("\nRunning BoxFilterGPU for %d cycles...\n\n", iCycles);

    for (int i = 0; i < iCycles; i++)
    {
        dProcessingTime += boxFilterRGBA(d_img, d_temp, d_img, width, height, filter_radius, iterations, nthreads, kernel_timer);
    }

    // check if kernel execution generated an error and sync host
    getLastCudaError("Error: boxFilterRGBA Kernel execution FAILED");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&kernel_timer);

    // Get average computation time
    dProcessingTime /= (double)iCycles;

    // log testname, throughput, timing and config info to sample and master logs
    printf("boxFilter-texture, Throughput = %.4f M RGBA Pixels/s, Time = %.5f s, Size = %u RGBA Pixels, NumDevsUsed = %u, Workgroup = %u\n",
           (1.0e-6 * width * height)/dProcessingTime, dProcessingTime,
           (width * height), 1, nthreads);
    printf("\n");

    return 0;
}
Esempio n. 21
0
HRESULT RegisterD3D9ResourceWithCUDA()
{
    // 2D
    // register the Direct3D resources that we'll use
    // we'll read to and write from g_texture_2d, so don't set any special map flags for it
    cudaGraphicsD3D9RegisterResource(&g_texture_2d.cudaResource, g_texture_2d.pTexture, cudaGraphicsRegisterFlagsNone);
    getLastCudaError("cudaGraphicsD3D9RegisterResource (g_texture_2d) failed");
    // cuda cannot write into the texture directly : the texture is seen as a cudaArray and can only be mapped as a texture
    // Create a buffer so that cuda can write into it
    // pixel fmt is DXGI_FORMAT_R32G32B32A32_FLOAT
    cudaMallocPitch(&g_texture_2d.cudaLinearMemory, &g_texture_2d.pitch, g_texture_2d.width * sizeof(float) * 4, g_texture_2d.height);
    getLastCudaError("cudaMallocPitch (g_texture_2d) failed");
    cudaMemset(g_texture_2d.cudaLinearMemory, 1, g_texture_2d.pitch * g_texture_2d.height);

    // CUBE
    cudaGraphicsD3D9RegisterResource(&g_texture_cube.cudaResource, g_texture_cube.pTexture, cudaGraphicsRegisterFlagsNone);
    getLastCudaError("cudaGraphicsD3D9RegisterResource (g_texture_cube) failed");
    // create the buffer. pixel fmt is DXGI_FORMAT_R8G8B8A8_SNORM
    cudaMallocPitch(&g_texture_cube.cudaLinearMemory, &g_texture_cube.pitch, g_texture_cube.size * 4, g_texture_cube.size);
    getLastCudaError("cudaMallocPitch (g_texture_cube) failed");
    cudaMemset(g_texture_cube.cudaLinearMemory, 1, g_texture_cube.pitch * g_texture_cube.size);
    getLastCudaError("cudaMemset (g_texture_cube) failed");

    // 3D
    cudaGraphicsD3D9RegisterResource(&g_texture_vol.cudaResource, g_texture_vol.pTexture, cudaGraphicsRegisterFlagsNone);
    getLastCudaError("cudaGraphicsD3D9RegisterResource (g_texture_vol) failed");
    // create the buffer. pixel fmt is DXGI_FORMAT_R8G8B8A8_SNORM
    //cudaMallocPitch(&g_texture_vol.cudaLinearMemory, &g_texture_vol.pitch, g_texture_vol.width * 4, g_texture_vol.height * g_texture_vol.depth);
    cudaMalloc(&g_texture_vol.cudaLinearMemory, g_texture_vol.width * 4 * g_texture_vol.height * g_texture_vol.depth);
    g_texture_vol.pitch = g_texture_vol.width * 4;
    getLastCudaError("cudaMallocPitch (g_texture_vol) failed");
    cudaMemset(g_texture_vol.cudaLinearMemory, 1, g_texture_vol.pitch * g_texture_vol.height * g_texture_vol.depth);
    getLastCudaError("cudaMemset (g_texture_vol) failed");

    return S_OK;
}
// render image using CUDA
void render()
{

    VolumeRender_copyInvViewMatrix(invViewMatrix, sizeof(float4)*3);

    // map PBO to get CUDA device pointer
    uint *d_output;
    // map PBO to get CUDA device pointer
    checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
    size_t num_bytes;
    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes,
                                                         cuda_pbo_resource));
    //printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes);

    // clear image
    checkCudaErrors(cudaMemset(d_output, 0, width*height*4));

    // call CUDA kernel, writing results to PBO
    VolumeRender_render(gridSize, blockSize, d_output, width, height, density, brightness, transferOffset, transferScale);

    getLastCudaError("render kernel failed");

    checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));
}
Esempio n. 23
0
int main(int argc, char **argv)
{
    int devID;
    cudaDeviceProp deviceProps;
    printf("%s Starting...\n\n", sSDKname);
    printf("[%s] - [OpenGL/CUDA simulation] starting...\n", sSDKname);

    // First initialize OpenGL context, so we can properly set the GL for CUDA.
    // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
    if (false == initGL(&argc, argv))
    {
        exit(EXIT_SUCCESS);
    }

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    devID = findCudaGLDevice(argc, (const char **)argv);

    // get number of SMs on this GPU
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors\n",
           deviceProps.name, deviceProps.multiProcessorCount);

    // automated build testing harness
    if (checkCmdLineFlag(argc, (const char **)argv, "file"))
    {
        getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
    }

    // Allocate and initialize host data
    GLint bsize;

    sdkCreateTimer(&timer);
    sdkResetTimer(&timer);

    hvfield = (cData *)malloc(sizeof(cData) * DS);
    memset(hvfield, 0, sizeof(cData) * DS);

    // Allocate and initialize device data
    cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(cData)*DIM, DIM);

    cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS,
               cudaMemcpyHostToDevice);
    // Temporary complex velocity field data
    cudaMalloc((void **)&vxfield, sizeof(cData) * PDS);
    cudaMalloc((void **)&vyfield, sizeof(cData) * PDS);

    setupTexture(DIM, DIM);
    bindTexture();

    // Create particle array
    particles = (cData *)malloc(sizeof(cData) * DS);
    memset(particles, 0, sizeof(cData) * DS);

    initParticles(particles, DIM, DIM);

    // Create CUFFT transform plan configuration
    cufftPlan2d(&planr2c, DIM, DIM, CUFFT_R2C);
    cufftPlan2d(&planc2r, DIM, DIM, CUFFT_C2R);
    // TODO: update kernels to use the new unpadded memory layout for perf
    // rather than the old FFTW-compatible layout
    cufftSetCompatibilityMode(planr2c, CUFFT_COMPATIBILITY_FFTW_PADDING);
    cufftSetCompatibilityMode(planc2r, CUFFT_COMPATIBILITY_FFTW_PADDING);

    glGenBuffersARB(1, &vbo);
    glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo);
    glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(cData) * DS,
                    particles, GL_DYNAMIC_DRAW_ARB);

    glGetBufferParameterivARB(GL_ARRAY_BUFFER_ARB, GL_BUFFER_SIZE_ARB, &bsize);

    if (bsize != (sizeof(cData) * DS))
        goto EXTERR;

    glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);

    checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone));
    getLastCudaError("cudaGraphicsGLRegisterBuffer failed");

    if (ref_file)
    {
        autoTest(argv);
        cleanup();
        cudaDeviceReset();
        printf("[fluidsGL] - Test Results: %d Failures\n", g_TotalErrors);
        exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE);

    }
    else
    {
        atexit(cleanup);
        glutMainLoop();
    }

    cudaDeviceReset();

    if (!ref_file)
    {
        exit(EXIT_SUCCESS);
    }

    return 0;

EXTERR:
    printf("Failed to initialize GL extensions.\n");

    cudaDeviceReset();
    exit(EXIT_FAILURE);
}
//-----------------------------------------------------------------------------
// Name: Cleanup()
// Desc: Releases all previously initialized objects
//-----------------------------------------------------------------------------
VOID Cleanup()
{
    if (g_histogram.pBuffer != NULL)
    {
        // Unregister vertex buffer
        cudaGraphicsUnregisterResource(g_histogram.cudaResource);
        getLastCudaError("cudaGraphicsUnregisterResource failed");
        g_histogram.pBuffer->Release();
    }

    if (g_histogram.pBufferSRV != NULL)
    {
        g_histogram.pBufferSRV->Release();
    }

    if (g_pDisplayEffect != NULL)
    {
        g_pDisplayEffect->Release();
    }

    if (g_pCompositeEffect != NULL)
    {
        g_pCompositeEffect->Release();
    }

    if (g_color.pBufferSRV != NULL)
    {
        g_color.pBufferSRV->Release();
    }

    if (g_color.pBufferRTV != NULL)
    {
        g_color.pBufferRTV->Release();
    }

    if (g_color.pBuffer != NULL)
    {
        // Unregister vertex buffer
        cudaGraphicsUnregisterResource(g_color.cudaResource);
        getLastCudaError("cudaD3D10UnregisterResource failed");
        g_color.pBuffer->Release();
    }

    if (g_pRasterState != NULL)
    {
        g_pRasterState->Release();
    }

    if (g_pSwapChainRTV != NULL)
    {
        g_pSwapChainRTV->Release();
    }

    if (g_pSwapChain != NULL)
    {
        g_pSwapChain->Release();
    }

    // Uninitialize CUDA

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    getLastCudaError("cudaDeviceReset failed");

    if (g_pd3dDevice != NULL)
    {
        g_pd3dDevice->Release();
    }

}
T benchmarkReduce(int  n,
                  int  numThreads,
                  int  numBlocks,
                  int  maxThreads,
                  int  maxBlocks,
                  int  whichKernel,
                  int  testIterations,
                  bool cpuFinalReduction,
                  int  cpuFinalThreshold,
                  StopWatchInterface *timer,
                  T *h_odata,
                  T *d_idata,
                  T *d_odata)
{
    T gpu_result = 0;
    bool needReadBack = true;

    for (int i = 0; i < testIterations; ++i)
    {
        gpu_result = 0;

        cudaDeviceSynchronize();
        sdkStartTimer(&timer);

        // execute the kernel
        reduce<T>(n, numThreads, numBlocks, whichKernel, d_idata, d_odata);

        // check if kernel execution generated an error
        getLastCudaError("Kernel execution failed");

        if (cpuFinalReduction)
        {
            // sum partial sums from each block on CPU
            // copy result from device to host
            checkCudaErrors(cudaMemcpy(h_odata, d_odata, numBlocks*sizeof(T), cudaMemcpyDeviceToHost));

            for (int i=0; i<numBlocks; i++)
            {
                gpu_result += h_odata[i];
            }

            needReadBack = false;
        }
        else
        {
            // sum partial block sums on GPU
            int s=numBlocks;
            int kernel = whichKernel;

            while (s > cpuFinalThreshold)
            {
                int threads = 0, blocks = 0;
                getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads);

                reduce<T>(s, threads, blocks, kernel, d_odata, d_odata);

                if (kernel < 3)
                {
                    s = (s + threads - 1) / threads;
                }
                else
                {
                    s = (s + (threads*2-1)) / (threads*2);
                }
            }

            if (s > 1)
            {
                // copy result from device to host
                checkCudaErrors(cudaMemcpy(h_odata, d_odata, s * sizeof(T), cudaMemcpyDeviceToHost));

                for (int i=0; i < s; i++)
                {
                    gpu_result += h_odata[i];
                }

                needReadBack = false;
            }
        }

        cudaDeviceSynchronize();
        sdkStopTimer(&timer);
    }

    if (needReadBack)
    {
        // copy final sum from device to host
        checkCudaErrors(cudaMemcpy(&gpu_result, d_odata, sizeof(T), cudaMemcpyDeviceToHost));
    }

    return gpu_result;
}
Esempio n. 26
0
////////////////////////////////////////////////////////////////////////////////
//! Run the Cuda part of the computation
////////////////////////////////////////////////////////////////////////////////
void RunKernels()
{
    static float t = 0.0f;

    // populate the 2d texture
    {
        cudaArray *cuArray;
        cudaGraphicsSubResourceGetMappedArray(&cuArray, g_texture_2d.cudaResource, 0, 0);
        getLastCudaError("cudaGraphicsSubResourceGetMappedArray (cuda_texture_2d) failed");

        // kick off the kernel and send the staging buffer cudaLinearMemory as an argument to allow the kernel to write to it
        cuda_texture_2d(g_texture_2d.cudaLinearMemory, g_texture_2d.width, g_texture_2d.height, g_texture_2d.pitch, t);
        getLastCudaError("cuda_texture_2d failed");

        // then we want to copy cudaLinearMemory to the D3D texture, via its mapped form : cudaArray
        cudaMemcpy2DToArray(
            cuArray, // dst array
            0, 0,    // offset
            g_texture_2d.cudaLinearMemory, g_texture_2d.pitch,       // src
            g_texture_2d.width*4*sizeof(float), g_texture_2d.height, // extent
            cudaMemcpyDeviceToDevice); // kind
        getLastCudaError("cudaMemcpy2DToArray failed");
    }
    // populate the volume texture
    {
        size_t pitchSlice = g_texture_vol.pitch * g_texture_vol.height;
        cudaArray *cuArray;
        cudaGraphicsSubResourceGetMappedArray(&cuArray, g_texture_vol.cudaResource, 0, 0);
        getLastCudaError("cudaGraphicsSubResourceGetMappedArray (cuda_texture_3d) failed");

        // kick off the kernel and send the staging buffer cudaLinearMemory as an argument to allow the kernel to write to it
        cuda_texture_volume(g_texture_vol.cudaLinearMemory, g_texture_vol.width, g_texture_vol.height, g_texture_vol.depth, g_texture_vol.pitch, pitchSlice, t);
        getLastCudaError("cuda_texture_3d failed");

        // then we want to copy cudaLinearMemory to the D3D texture, via its mapped form : cudaArray
        struct cudaMemcpy3DParms memcpyParams = {0};
        memcpyParams.dstArray = cuArray;
        memcpyParams.srcPtr.ptr = g_texture_vol.cudaLinearMemory;
        memcpyParams.srcPtr.pitch = g_texture_vol.pitch;
        memcpyParams.srcPtr.xsize = g_texture_vol.width;
        memcpyParams.srcPtr.ysize = g_texture_vol.height;
        memcpyParams.extent.width = g_texture_vol.width;
        memcpyParams.extent.height = g_texture_vol.height;
        memcpyParams.extent.depth = g_texture_vol.depth;
        memcpyParams.kind = cudaMemcpyDeviceToDevice;
        cudaMemcpy3D(&memcpyParams);
        getLastCudaError("cudaMemcpy3D failed");
    }

    // populate the faces of the cube map
    for (int face = 0; face < 6; ++face)
    {
        cudaArray *cuArray;
        cudaGraphicsSubResourceGetMappedArray(&cuArray, g_texture_cube.cudaResource, face, 0);
        getLastCudaError("cudaGraphicsSubResourceGetMappedArray (cuda_texture_cube) failed");

        // kick off the kernel and send the staging buffer cudaLinearMemory as an argument to allow the kernel to write to it
        cuda_texture_cube(g_texture_cube.cudaLinearMemory, g_texture_cube.size, g_texture_cube.size, g_texture_cube.pitch, face, t);
        getLastCudaError("cuda_texture_cube failed");

        // then we want to copy cudaLinearMemory to the D3D texture, via its mapped form : cudaArray
        cudaMemcpy2DToArray(
            cuArray, // dst array
            0, 0,    // offset
            g_texture_cube.cudaLinearMemory, g_texture_cube.pitch, // src
            g_texture_cube.size*4, g_texture_cube.size,            // extent
            cudaMemcpyDeviceToDevice); // kind
        getLastCudaError("cudaMemcpy2DToArray failed");
    }

    t += 0.1f;
}
//////////////////////////////////////////////////////////////////////////
// AUTOMATIC TESTING
void runSingleTest(const char *ref_file, const char *exec_path)
{
    uint *d_output;
    checkCudaErrors(cudaMalloc((void **)&d_output, width*height*sizeof(uint)));
    checkCudaErrors(cudaMemset(d_output, 0, width*height*sizeof(uint)));

    float modelView[16] =
    {
        1.0f, 0.0f, 0.0f, 0.0f,
        0.0f, 1.0f, 0.0f, 0.0f,
        0.0f, 0.0f, 1.0f, 0.0f,
        0.0f, 0.0f, 4.0f, 1.0f
    };

    invViewMatrix[0] = modelView[0];
    invViewMatrix[1] = modelView[4];
    invViewMatrix[2] = modelView[8];
    invViewMatrix[3] = modelView[12];
    invViewMatrix[4] = modelView[1];
    invViewMatrix[5] = modelView[5];
    invViewMatrix[6] = modelView[9];
    invViewMatrix[7] = modelView[13];
    invViewMatrix[8] = modelView[2];
    invViewMatrix[9] = modelView[6];
    invViewMatrix[10] = modelView[10];
    invViewMatrix[11] = modelView[14];

    // call CUDA kernel, writing results to PBO
    VolumeRender_copyInvViewMatrix(invViewMatrix, sizeof(float4)*3);
    filterAnimation = false;

    // Start timer 0 and process n loops on the GPU
    int nIter = 10;
    float scale = 2.0f/float(nIter-1);

    for (int i = -1; i < nIter; i++)
    {
        if (i == 0)
        {
            cudaDeviceSynchronize();
            sdkStartTimer(&timer);
        }

        filterFactor = (float(i) * scale) - 1.0f;
        filterFactor = -filterFactor;
        filter();
        VolumeRender_render(gridSize, blockSize, d_output, width, height, density, brightness, transferOffset, transferScale);
    }

    cudaDeviceSynchronize();
    sdkStopTimer(&timer);
    // Get elapsed time and throughput, then log to sample and master logs
    double dAvgTime = sdkGetTimerValue(&timer)/(nIter * 1000.0);
    printf("volumeFiltering, Throughput = %.4f MTexels/s, Time = %.5f s, Size = %u Texels, NumDevsUsed = %u, Workgroup = %u\n",
           (1.0e-6 * width * height)/dAvgTime, dAvgTime, (width * height), 1, blockSize.x * blockSize.y);


    getLastCudaError("Error: kernel execution FAILED");
    checkCudaErrors(cudaDeviceSynchronize());

    unsigned char *h_output = (unsigned char *)malloc(width*height*4);
    checkCudaErrors(cudaMemcpy(h_output, d_output, width*height*4, cudaMemcpyDeviceToHost));

    sdkSavePPM4ub("volumefilter.ppm", h_output, width, height);
    bool bTestResult = sdkComparePPM("volumefilter.ppm", sdkFindFilePath(ref_file, exec_path),
                                     MAX_EPSILON_ERROR, THRESHOLD, true);

    checkCudaErrors(cudaFree(d_output));
    free(h_output);
    cleanup();

    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
Esempio n. 28
0
int main(int argc, char **argv)
{
    int devID;
    cudaDeviceProp deviceProps;
    printf("%s Starting...\n\n", sSDKname);
    printf("[%s] - [OpenGL/CUDA simulation] starting...\n", sSDKname);

    // First initialize OpenGL context, so we can properly set the GL for CUDA.
    // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
    if (false == initGL(&argc, argv))
    {
        exit(EXIT_SUCCESS);
    }

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
#ifndef OPTIMUS
    devID = findCudaGLDevice(argc, (const char **)argv);
#else
    devID = gpuGetMaxGflopsDeviceId();
#endif

    // get number of SMs on this GPU
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors\n",
           deviceProps.name, deviceProps.multiProcessorCount);

    // automated build testing harness
    if (checkCmdLineFlag(argc, (const char **)argv, "file"))
    {
        getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
    }

    // Allocate and initialize host data
    GLint bsize;

    sdkCreateTimer(&timer);
    sdkResetTimer(&timer);

    hvfield = (cData *)malloc(sizeof(cData) * DS);
    memset(hvfield, 0, sizeof(cData) * DS);

    // Allocate and initialize device data
    cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(cData)*DIM, DIM);

    cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS,
               cudaMemcpyHostToDevice);
    // Temporary complex velocity field data
    cudaMalloc((void **)&vxfield, sizeof(cData) * PDS);
    cudaMalloc((void **)&vyfield, sizeof(cData) * PDS);

    setupTexture(DIM, DIM);
    bindTexture();

    // Create particle array in host memory
    particles = (cData *)malloc(sizeof(cData) * DS);
    memset(particles, 0, sizeof(cData) * DS);

#ifdef BROADCAST
	int step = 1;

	// Broadcasted visualization stepping.
	if (argc > 3)
		step = atoi(argv[3]);

	// Create additional space to store particle packets
	// for broadcasting.
	wstep = step; hstep = step;
	int npackets = sizeof(float) * (DIM / wstep) * (DIM / hstep) / UdpBroadcastServer::PacketSize;
	if (sizeof(float) * (DIM / wstep) * (DIM / hstep) % UdpBroadcastServer::PacketSize)
		npackets++;
	packets = (char*)malloc(npackets *
		(UdpBroadcastServer::PacketSize + sizeof(unsigned int)));
#endif

    initParticles(particles, DIM, DIM);

#if defined(OPTIMUS) || defined(BROADCAST)
    // Create particle array in device memory
    cudaMalloc((void **)&particles_gpu, sizeof(cData) * DS);
    cudaMemcpy(particles_gpu, particles, sizeof(cData) * DS, cudaMemcpyHostToDevice);
#endif

    // Create CUFFT transform plan configuration
    cufftPlan2d(&planr2c, DIM, DIM, CUFFT_R2C);
    cufftPlan2d(&planc2r, DIM, DIM, CUFFT_C2R);
    // TODO: update kernels to use the new unpadded memory layout for perf
    // rather than the old FFTW-compatible layout
    cufftSetCompatibilityMode(planr2c, CUFFT_COMPATIBILITY_FFTW_PADDING);
    cufftSetCompatibilityMode(planc2r, CUFFT_COMPATIBILITY_FFTW_PADDING);

    glGenBuffersARB(1, &vbo);
    glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo);
    glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(cData) * DS,
                    particles, GL_DYNAMIC_DRAW_ARB);

    glGetBufferParameterivARB(GL_ARRAY_BUFFER_ARB, GL_BUFFER_SIZE_ARB, &bsize);

    if (bsize != (sizeof(cData) * DS))
        goto EXTERR;

    glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);

#ifndef OPTIMUS
    checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone));
    getLastCudaError("cudaGraphicsGLRegisterBuffer failed");
#endif

    if (ref_file)
    {
        autoTest(argv);
        cleanup();

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        printf("[fluidsGL] - Test Results: %d Failures\n", g_TotalErrors);
        exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE);

    }
    else
    {
#ifdef BROADCAST
		const char *sv_addr = "127.0.0:9097";
		const char *bc_addr = "127.255.255.2:9097";

		// Server address
		if (argc > 2)
			sv_addr = argv[2];

		// Broadcast address
		if (argc > 1)
			bc_addr = argv[1];

		server.reset(new UdpBroadcastServer(sv_addr, bc_addr));

		// Listen to clients' feedbacks in a separate thread.
		{
			pthread_t tid;
			pthread_create(&tid, NULL, &feedback_listener, &step);
		}

		// Broadcast the particles state in a separate thread.
		{
			pthread_t tid;
			pthread_create(&tid, NULL, &broadcaster, &step);
		}
#endif
#if defined (__APPLE__) || defined(MACOSX)
        atexit(cleanup);
#else
        glutCloseFunc(cleanup);
#endif
        glutMainLoop();
    }

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    if (!ref_file)
    {
        exit(EXIT_SUCCESS);
    }

    return 0;

EXTERR:
    printf("Failed to initialize GL extensions.\n");

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    exit(EXIT_FAILURE);
}
Esempio n. 29
0
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv, char *ref_file)
{
    // Register the window class
    WNDCLASSEX wc = { sizeof(WNDCLASSEX), CS_CLASSDC, MsgProc, 0L, 0L,
                      GetModuleHandle(NULL), NULL, NULL, NULL, NULL,
                      "CUDA/D3D10 simpleD3D10", NULL
                    };
    RegisterClassEx(&wc);

    // Create the application's window
    int xBorder = ::GetSystemMetrics(SM_CXSIZEFRAME);
    int yMenu = ::GetSystemMetrics(SM_CYMENU);
    int yBorder = ::GetSystemMetrics(SM_CYSIZEFRAME);
    HWND hWnd = CreateWindow(wc.lpszClassName, "CUDA/D3D10 simpleD3D10",
                             WS_OVERLAPPEDWINDOW, 0, 0, g_WindowWidth + 2*xBorder, g_WindowHeight+ 2*yBorder+yMenu,
                             NULL, NULL, wc.hInstance, NULL);

    // Initialize Direct3D
    if (SUCCEEDED(InitD3D(hWnd)))
    {
        // Create the scene geometry
        if (SUCCEEDED(InitGeometry()))
        {
            // Initialize interoperability between CUDA and Direct3D
            // Register vertex buffer with CUDA
            // DEPRECATED: cudaD3D10RegisterResource(g_pVB, cudaD3D10RegisterFlagsNone);
            cudaGraphicsD3D10RegisterResource(&cuda_VB_resource, g_pVB, cudaD3D10RegisterFlagsNone);
            getLastCudaError("cudaGraphicsD3D10RegisterResource (g_pVB) failed");

            // Initialize vertex buffer with CUDA
            runCuda();

            // Save result
            SaveResult(argc, argv);

            // Show the window
            ShowWindow(hWnd, SW_SHOWDEFAULT);
            UpdateWindow(hWnd);

            // Enter the message loop
            MSG msg;
            ZeroMemory(&msg, sizeof(msg));

            while (msg.message!=WM_QUIT)
            {
                if (PeekMessage(&msg, NULL, 0U, 0U, PM_REMOVE))
                {
                    TranslateMessage(&msg);
                    DispatchMessage(&msg);
                }
                else
                {
                    Render();

                    if (ref_file != NULL)
                    {
                        for (int count=0; count<g_iFrameToCompare; count++)
                        {
                            Render();
                        }

                        const char *cur_image_path = "simpleD3D10.ppm";

                        // Save a reference of our current test run image
                        CheckRenderD3D10::ActiveRenderTargetToPPM(g_pd3dDevice,cur_image_path);

                        // compare to offical reference image, printing PASS or FAIL.
                        g_bPassed = CheckRenderD3D10::PPMvsPPM(cur_image_path, ref_file, argv[0],MAX_EPSILON, 0.15f);

                        Cleanup();

                        PostQuitMessage(0);
                    }
                }
            }
        }
    }

    // Release D3D Library (after message loop)
    dynlinkUnloadD3D10API();

    UnregisterClass(wc.lpszClassName, wc.hInstance);
}
Esempio n. 30
0
//-----------------------------------------------------------------------------
// Name: InitD3D()
// Desc: Initializes Direct3D
//-----------------------------------------------------------------------------
HRESULT InitD3D(HWND hWnd)
{
    // Set up the structure used to create the device and swapchain
    DXGI_SWAP_CHAIN_DESC sd;
    ZeroMemory(&sd, sizeof(sd));
    sd.BufferCount = 1;
    sd.BufferDesc.Width = g_WindowWidth;
    sd.BufferDesc.Height = g_WindowHeight;
    sd.BufferDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM;
    sd.BufferDesc.RefreshRate.Numerator = 60;
    sd.BufferDesc.RefreshRate.Denominator = 1;
    sd.BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT;
    sd.OutputWindow = hWnd;
    sd.SampleDesc.Count = 1;
    sd.SampleDesc.Quality = 0;
    sd.Windowed = TRUE;

    // Create device and swapchain
    HRESULT hr = sFnPtr_D3D10CreateDeviceAndSwapChain(
                     g_pCudaCapableAdapter,
                     D3D10_DRIVER_TYPE_HARDWARE,
                     NULL,
                     0,
                     D3D10_SDK_VERSION,
                     &sd,
                     &g_pSwapChain,
                     &g_pd3dDevice);
    AssertOrQuit(SUCCEEDED(hr));
    g_pCudaCapableAdapter->Release();

    // Create a render target view of the swapchain
    ID3D10Texture2D *pBuffer;
    hr = g_pSwapChain->GetBuffer(0, __uuidof(ID3D10Texture2D), (LPVOID *)&pBuffer);
    AssertOrQuit(SUCCEEDED(hr));

    hr = g_pd3dDevice->CreateRenderTargetView(pBuffer, NULL, &g_pSwapChainRTV);
    AssertOrQuit(SUCCEEDED(hr));
    pBuffer->Release();

    g_pd3dDevice->OMSetRenderTargets(1, &g_pSwapChainRTV, NULL);

    // Setup the viewport
    D3D10_VIEWPORT vp;
    vp.Width = g_WindowWidth;
    vp.Height = g_WindowHeight;
    vp.MinDepth = 0.0f;
    vp.MaxDepth = 1.0f;
    vp.TopLeftX = 0;
    vp.TopLeftY = 0;
    g_pd3dDevice->RSSetViewports(1, &vp);

    // Setup the effect
    {
        ID3D10Blob *pCompiledEffect;
        hr = sFnPtr_D3D10CompileEffectFromMemory(
                 (void *)g_simpleEffectSrc,
                 sizeof(g_simpleEffectSrc),
                 NULL,
                 NULL, // pDefines
                 NULL, // pIncludes
                 0, // HLSL flags
                 0, // FXFlags
                 &pCompiledEffect,
                 NULL);
        AssertOrQuit(SUCCEEDED(hr));

        hr = sFnPtr_D3D10CreateEffectFromMemory(
                 pCompiledEffect->GetBufferPointer(),
                 pCompiledEffect->GetBufferSize(),
                 0, // FXFlags
                 g_pd3dDevice,
                 NULL,
                 &g_pSimpleEffect);
        pCompiledEffect->Release();

        g_pSimpleTechnique = g_pSimpleEffect->GetTechniqueByName("Render");

        //    g_pmWorldViewProjection = g_pSimpleEffect->GetVariableByName("g_mWorldViewProjection")->AsMatrix();
        g_pmWorld = g_pSimpleEffect->GetVariableByName("g_mWorld")->AsMatrix();
        g_pmView = g_pSimpleEffect->GetVariableByName("g_mView")->AsMatrix();
        g_pmProjection = g_pSimpleEffect->GetVariableByName("g_mProjection")->AsMatrix();

        // Define the input layout
        D3D10_INPUT_ELEMENT_DESC layout[] =
        {
            { "POSITION", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0, D3D10_INPUT_PER_VERTEX_DATA, 0 },
            { "COLOR", 0, DXGI_FORMAT_R8G8B8A8_UNORM, 0, 12, D3D10_INPUT_PER_VERTEX_DATA, 0 },
        };
        UINT numElements = sizeof(layout)/sizeof(layout[0]);

        // Create the input layout
        D3D10_PASS_DESC PassDesc;
        g_pSimpleTechnique->GetPassByIndex(0)->GetDesc(&PassDesc);
        hr = g_pd3dDevice->CreateInputLayout(layout, numElements, PassDesc.pIAInputSignature, PassDesc.IAInputSignatureSize, &g_pInputLayout);
        AssertOrQuit(SUCCEEDED(hr));

        // Setup Input Layout, apply effect and draw points
        g_pd3dDevice->IASetInputLayout(g_pInputLayout);
        g_pSimpleTechnique->GetPassByIndex(0)->Apply(0);
        g_pd3dDevice->IASetPrimitiveTopology(D3D10_PRIMITIVE_TOPOLOGY_POINTLIST);

    }

    // begin interop
    cudaD3D10SetDirect3DDevice(g_pd3dDevice);
    getLastCudaError("cudaD3D10SetDirect3DDevice failed");

    return S_OK;
}