void keyboard(unsigned char key, int x, int y) { switch (key) { case 27: g_bExitESC = true; exit(EXIT_SUCCESS); break; case 'r': memset(hvfield, 0, sizeof(cData) * DS); cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS, cudaMemcpyHostToDevice); initParticles(particles, DIM, DIM); cudaGraphicsUnregisterResource(cuda_vbo_resource); getLastCudaError("cudaGraphicsUnregisterBuffer failed"); glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo); glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(cData) * DS, particles, GL_DYNAMIC_DRAW_ARB); glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0); cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone); getLastCudaError("cudaGraphicsGLRegisterBuffer failed"); break; default: break; } }
//////////////////////////////////////////////////////////////////////////////// //! Run the Cuda part of the computation //////////////////////////////////////////////////////////////////////////////// void runCuda() { cudaStream_t stream = 0; const int nbResources = 2; cudaGraphicsResource *ppResources[nbResources] = { g_histogram.cudaResource, g_color.cudaResource, }; // Map resources for Cuda checkCudaErrors(cudaGraphicsMapResources(nbResources, ppResources, stream)); getLastCudaError("cudaGraphicsMapResources(2) failed"); // Get pointers checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&g_histogram.cudaBuffer, &g_histogram.size, g_histogram.cudaResource)); getLastCudaError("cudaGraphicsResourceGetMappedPointer (g_color.pBuffer) failed"); cudaGraphicsSubResourceGetMappedArray(&g_color.pCudaArray, g_color.cudaResource, 0, 0); getLastCudaError("cudaGraphicsSubResourceGetMappedArray (g_color.pBuffer) failed"); // Execute kernel createHistogramTex(g_histogram.cudaBuffer, g_WindowWidth, g_WindowHeight, g_color.pCudaArray); checkCudaError(); // // unmap the resources // checkCudaErrors(cudaGraphicsUnmapResources(nbResources, ppResources, stream)); getLastCudaError("cudaGraphicsUnmapResources(2) failed"); }
//----------------------------------------------------------------------------- // Name: Cleanup() // Desc: Releases all previously initialized objects //----------------------------------------------------------------------------- VOID Cleanup() { if (g_pVB != NULL) { // Unregister vertex buffer // DEPRECATED: checkCudaErrors(cudaD3D10UnregisterResource(g_pVB)); cudaGraphicsUnregisterResource(cuda_VB_resource); getLastCudaError("cudaGraphicsUnregisterResource failed"); g_pVB->Release(); } if (g_pInputLayout != NULL) g_pInputLayout->Release(); if (g_pSimpleEffect != NULL) g_pSimpleEffect->Release(); if (g_pSwapChainRTV != NULL) g_pSwapChainRTV->Release(); if (g_pSwapChain != NULL) g_pSwapChain->Release(); // Uninitialize CUDA cudaDeviceReset(); getLastCudaError("cudaDeviceReset failed"); if (g_pd3dDevice != NULL) g_pd3dDevice->Release(); }
//----------------------------------------------------------------------------- // Name: RunCUDA() // Desc: Launches the CUDA kernels to fill in the texture data //----------------------------------------------------------------------------- void RunCUDA() { // // map the resources we've registered so we can access them in Cuda // - it is most efficient to map and unmap all resources in a single call, // and to have the map/unmap calls be the boundary between using the GPU // for Direct3D and Cuda // if (!g_bDeviceLost) { cudaStream_t stream = 0; const int nbResources = 3; cudaGraphicsResource *ppResources[nbResources] = { g_texture_2d.cudaResource, g_texture_vol.cudaResource, g_texture_cube.cudaResource, }; cudaGraphicsMapResources(nbResources, ppResources, stream); getLastCudaError("cudaGraphicsMapResources(3) failed"); // // run kernels which will populate the contents of those textures // RunKernels(); // // unmap the resources // cudaGraphicsUnmapResources(nbResources, ppResources, stream); getLastCudaError("cudaGraphicsUnmapResources(3) failed"); } }
void displayFunc(void) { sdkStartTimer(&timer); TColor *d_dst = NULL; size_t num_bytes; if (frameCounter++ == 0) { sdkResetTimer(&timer); } // DEPRECATED: checkCudaErrors(cudaGLMapBufferObject((void**)&d_dst, gl_PBO)); checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); getLastCudaError("cudaGraphicsMapResources failed"); checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_dst, &num_bytes, cuda_pbo_resource)); getLastCudaError("cudaGraphicsResourceGetMappedPointer failed"); checkCudaErrors(CUDA_Bind2TextureArray()); runImageFilters(d_dst); checkCudaErrors(CUDA_UnbindTexture()); // DEPRECATED: checkCudaErrors(cudaGLUnmapBufferObject(gl_PBO)); checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); // Common display code path { glClear(GL_COLOR_BUFFER_BIT); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, imageW, imageH, GL_RGBA, GL_UNSIGNED_BYTE, BUFFER_DATA(0)); glBegin(GL_TRIANGLES); glTexCoord2f(0, 0); glVertex2f(-1, -1); glTexCoord2f(2, 0); glVertex2f(+3, -1); glTexCoord2f(0, 2); glVertex2f(-1, +3); glEnd(); glFinish(); } if (frameCounter == frameN) { frameCounter = 0; if (g_FPS) { printf("FPS: %3.1f\n", frameN / (sdkGetTimerValue(&timer) * 0.001)); g_FPS = false; } } glutSwapBuffers(); glutReportErrors(); sdkStopTimer(&timer); computeFPS(); }
void keyboard(unsigned char key, int x, int y) { switch (key) { case 27: g_bExitESC = true; exit(EXIT_SUCCESS); break; case 'f': if (!fullscreen) { fullscreen = 1; glutFullScreenToggle(); } else { fullscreen = 0; glutLeaveFullScreen(); } break; case 'r': pthread_mutex_lock(&display_mutex); memset(hvfield, 0, sizeof(cData) * DS); cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS, cudaMemcpyHostToDevice); initParticles(particles, DIM, DIM); #ifndef OPTIMUS cudaGraphicsUnregisterResource(cuda_vbo_resource); getLastCudaError("cudaGraphicsUnregisterBuffer failed"); #endif #if defined(OPTIMUS) || defined(BROADCAST) cudaMemcpy(particles_gpu, particles, sizeof(cData) * DS, cudaMemcpyHostToDevice); #endif glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo); glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(cData) * DS, particles, GL_DYNAMIC_DRAW_ARB); glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0); #ifndef OPTIMUS cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone); getLastCudaError("cudaGraphicsGLRegisterBuffer failed"); #endif pthread_mutex_unlock(&display_mutex); break; default: break; } }
void runAutoTest(const char *ref_file, char *exec_path) { checkCudaErrors(cudaMalloc((void **)&d_output, width*height*sizeof(GLubyte)*4)); // render the volumeData render_kernel(gridSize, blockSize, d_output, width, height, w); checkCudaErrors(cudaDeviceSynchronize()); getLastCudaError("render_kernel failed"); void *h_output = malloc(width*height*sizeof(GLubyte)*4); checkCudaErrors(cudaMemcpy(h_output, d_output, width*height*sizeof(GLubyte)*4, cudaMemcpyDeviceToHost)); sdkDumpBin(h_output, width*height*sizeof(GLubyte)*4, "simpleTexture3D.bin"); bool bTestResult = sdkCompareBin2BinFloat("simpleTexture3D.bin", sdkFindFilePath(ref_file, exec_path), width*height, MAX_EPSILON_ERROR, THRESHOLD, exec_path); checkCudaErrors(cudaFree(d_output)); free(h_output); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); sdkStopTimer(&timer); sdkDeleteTimer(&timer); exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); }
void benchmark(int iterations) { // allocate memory for result unsigned int *d_result; unsigned int size = width * height * sizeof(unsigned int); checkCudaErrors(cudaMalloc((void **) &d_result, size)); // warm-up gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); checkCudaErrors(cudaDeviceSynchronize()); sdkStartTimer(&timer); // execute the kernel for (int i = 0; i < iterations; i++) { gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); } checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&timer); // check if kernel execution generated an error getLastCudaError("Kernel execution failed"); printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); printf("%.2f Mpixels/sec\n", (width*height*iterations / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6); checkCudaErrors(cudaFree(d_result)); }
HRESULT InitCUDA() { printf("InitCUDA() g_pD3DDevice = %p\n", g_pD3DDevice); // Now we need to bind a CUDA context to the DX9 device // This is the CUDA 2.0 DX9 interface (required for Windows XP and Vista) cudaD3D9SetDirect3DDevice(g_pD3DDevice); getLastCudaError("cudaD3D9SetDirect3DDevice failed"); return S_OK; }
//----------------------------------------------------------------------------- // Name: ReleaseTextures() // Desc: Release Direct3D Textures (free-ing) //----------------------------------------------------------------------------- HRESULT ReleaseTextures() { // unregister the Cuda resources cudaGraphicsUnregisterResource(g_texture_2d.cudaResource); getLastCudaError("cudaGraphicsUnregisterResource (g_texture_2d) failed"); cudaFree(g_texture_2d.cudaLinearMemory); getLastCudaError("cudaFree (g_texture_2d) failed"); cudaGraphicsUnregisterResource(g_texture_cube.cudaResource); getLastCudaError("cudaGraphicsUnregisterResource (g_texture_cube) failed"); cudaFree(g_texture_cube.cudaLinearMemory); getLastCudaError("cudaFree (g_texture_2d) failed"); cudaGraphicsUnregisterResource(g_texture_vol.cudaResource); getLastCudaError("cudaGraphicsUnregisterResource (g_texture_vol) failed"); cudaFree(g_texture_vol.cudaLinearMemory); getLastCudaError("cudaFree (g_texture_vol) failed"); // // clean up Direct3D // { // release the resources we created g_texture_2d.pTexture->Release(); g_texture_cube.pTexture->Release(); g_texture_vol.pTexture->Release(); } return S_OK; }
//----------------------------------------------------------------------------- // Name: Cleanup() // Desc: Releases all previously initialized objects //----------------------------------------------------------------------------- VOID Cleanup() { if (g_pVB != NULL) { // Unregister vertex buffer // DEPRECATED: checkCudaErrors(cudaD3D10UnregisterResource(g_pVB)); cudaGraphicsUnregisterResource(cuda_VB_resource); getLastCudaError("cudaGraphicsUnregisterResource failed"); g_pVB->Release(); } if (g_pInputLayout != NULL) g_pInputLayout->Release(); if (g_pSimpleEffect != NULL) g_pSimpleEffect->Release(); if (g_pSwapChainRTV != NULL) g_pSwapChainRTV->Release(); if (g_pSwapChain != NULL) g_pSwapChain->Release(); // Uninitialize CUDA // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); getLastCudaError("cudaDeviceReset failed"); if (g_pd3dDevice != NULL) g_pd3dDevice->Release(); }
HRESULT ReleaseCUDA() { // Uninitialize CUDA // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); getLastCudaError("cudaDeviceReset failed"); return S_OK; }
//////////////////////////////////////////////////////////////////////////////// //! Run the Cuda part of the computation //////////////////////////////////////////////////////////////////////////////// void runCuda() { // Map vertex buffer to Cuda float4 *d_ptr; // CUDA Map call to the Vertex Buffer and return a pointer // DEPRECATED: cudaD3D10MapResources(1, (ID3D10Resource **)&g_pVB); checkCudaErrors(cudaGraphicsMapResources(1, &cuda_VB_resource, 0)); getLastCudaError("cudaGraphicsMapResources failed"); // DEPRECATED: cudaD3D10ResourceGetMappedPointer( (void **)&dptr, g_pVB, 0); size_t num_bytes; checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_ptr, &num_bytes, cuda_VB_resource)); getLastCudaError("cudaGraphicsResourceGetMappedPointer failed"); // Execute kernel simpleD3DKernel(d_ptr, g_MeshWidth, g_MeshHeight, anim); // CUDA Map Unmap vertex buffer // DEPRECATED: cudaD3D10UnmapResources(1, (ID3D10Resource **)&g_pVB); checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_VB_resource, 0)); getLastCudaError("cudaGraphicsUnmapResource failed"); }
// This test specifies a single test (where you specify radius and/or iterations) int runSingleTest(char *ref_file, char *exec_path) { int nTotalErrors = 0; char dump_file[256]; printf("[runSingleTest]: [%s]\n", sSDKsample); initCuda(); unsigned int *dResult; unsigned int *hResult = (unsigned int *)malloc(width * height * sizeof(unsigned int)); size_t pitch; checkCudaErrors(cudaMallocPitch((void **)&dResult, &pitch, width*sizeof(unsigned int), height)); // run the sample radius { printf("%s (radius=%d) (passes=%d) ", sSDKsample, filter_radius, iterations); bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer); // check if kernel execution generated an error getLastCudaError("Error: bilateralFilterRGBA Kernel execution FAILED"); checkCudaErrors(cudaDeviceSynchronize()); // readback the results to system memory cudaMemcpy2D(hResult, sizeof(unsigned int)*width, dResult, pitch, sizeof(unsigned int)*width, height, cudaMemcpyDeviceToHost); sprintf(dump_file, "nature_%02d.ppm", filter_radius); sdkSavePPM4ub((const char *)dump_file, (unsigned char *)hResult, width, height); if (!sdkComparePPM(dump_file, sdkFindFilePath(ref_file, exec_path), MAX_EPSILON_ERROR, 0.15f, false)) { printf("Image is Different "); nTotalErrors++; } else { printf("Image is Matching "); } printf(" <%s>\n", ref_file); } printf("\n"); free(hResult); checkCudaErrors(cudaFree(dResult)); return nTotalErrors; }
void runImageFilters(TColor *d_dst) { switch (g_Kernel) { case 0: cuda_Copy(d_dst, imageW, imageH); break; case 1: if (!g_Diag) { cuda_KNN(d_dst, imageW, imageH, 1.0f / (knnNoise * knnNoise), lerpC); } else { cuda_KNNdiag(d_dst, imageW, imageH, 1.0f / (knnNoise * knnNoise), lerpC); } break; case 2: if (!g_Diag) { cuda_NLM(d_dst, imageW, imageH, 1.0f / (nlmNoise * nlmNoise), lerpC); } else { cuda_NLMdiag(d_dst, imageW, imageH, 1.0f / (nlmNoise * nlmNoise), lerpC); } break; case 3: if (!g_Diag) { cuda_NLM2(d_dst, imageW, imageH, 1.0f / (nlmNoise * nlmNoise), lerpC); } else { cuda_NLM2diag(d_dst, imageW, imageH, 1.0f / (nlmNoise * nlmNoise), lerpC); } break; } getLastCudaError("Filtering kernel execution failed.\n"); }
bool runSingleTest(const char *ref_file, const char *exec_path) { // allocate memory for result int nTotalErrors = 0; unsigned int *d_result; unsigned int size = width * height * sizeof(unsigned int); checkCudaErrors(cudaMalloc((void **) &d_result, size)); // warm-up gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); checkCudaErrors(cudaDeviceSynchronize()); sdkStartTimer(&timer); gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); checkCudaErrors(cudaDeviceSynchronize()); getLastCudaError("Kernel execution failed"); sdkStopTimer(&timer); unsigned char *h_result = (unsigned char *)malloc(width*height*4); checkCudaErrors(cudaMemcpy(h_result, d_result, width*height*4, cudaMemcpyDeviceToHost)); char dump_file[1024]; sprintf(dump_file, "lena_%02d.ppm", (int)sigma); sdkSavePPM4ub(dump_file, h_result, width, height); if (!sdkComparePPM(dump_file, sdkFindFilePath(ref_file, exec_path), MAX_EPSILON_ERROR, THRESHOLD, false)) { nTotalErrors++; } printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); printf("%.2f Mpixels/sec\n", (width*height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6); checkCudaErrors(cudaFree(d_result)); free(h_result); printf("Summary: %d errors!\n", nTotalErrors); printf(nTotalErrors == 0 ? "Test passed\n": "Test failed!\n"); return (nTotalErrors == 0); }
//////////////////////////////////////////////////////////////////////////////// //! Run a simple benchmark test for CUDA //////////////////////////////////////////////////////////////////////////////// int runBenchmark(int argc, char **argv) { printf("[runBenchmark]: [%s]\n", sSDKsample); loadImageData(argc, argv); initCuda(); unsigned int *dResult; size_t pitch; checkCudaErrors(cudaMallocPitch((void **)&dResult, &pitch, width*sizeof(unsigned int), height)); sdkStartTimer(&kernel_timer); // warm-up bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer); checkCudaErrors(cudaDeviceSynchronize()); // Start round-trip timer and process iCycles loops on the GPU iterations = 1; // standard 1-pass filtering const int iCycles = 150; double dProcessingTime = 0.0; printf("\nRunning BilateralFilterGPU for %d cycles...\n\n", iCycles); for (int i = 0; i < iCycles; i++) { dProcessingTime += bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer); } // check if kernel execution generated an error and sync host getLastCudaError("Error: bilateralFilterRGBA Kernel execution FAILED"); checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&kernel_timer); // Get average computation time dProcessingTime /= (double)iCycles; // log testname, throughput, timing and config info to sample and master logs printf("bilateralFilter-texture, Throughput = %.4f M RGBA Pixels/s, Time = %.5f s, Size = %u RGBA Pixels, NumDevsUsed = %u\n", (1.0e-6 * width * height)/dProcessingTime, dProcessingTime, (width * height), 1); printf("\n"); return 0; }
// render image using CUDA void render() { // map PBO to get CUDA device pointer g_GraphicsMapFlag++; checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); size_t num_bytes; checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, cuda_pbo_resource)); //printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes); // call CUDA kernel, writing results to PBO render_kernel(gridSize, blockSize, d_output, width, height, w); getLastCudaError("render_kernel failed"); if (g_GraphicsMapFlag) { checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); g_GraphicsMapFlag--; } }
void runAutoTest(int argc, char **argv, const char *dump_filename, eFilterMode filter_mode) { cudaDeviceProp deviceProps; int devID = findCudaDevice(argc, (const char **)argv); checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); printf("[%s] (automated testing w/ readback)\n", sSDKsample); printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount); loadImageData(argc, argv); uchar4 *d_output; checkCudaErrors(cudaMalloc((void **)&d_output, imageWidth*imageHeight*4)); unsigned int *h_result = (unsigned int *)malloc(width * height * sizeof(unsigned int)); printf("AutoTest: %s Filter Mode: <%s>\n", sSDKsample, sFilterMode[g_FilterMode]); render(imageWidth, imageHeight, tx, ty, scale, cx, cy, blockSize, gridSize, filter_mode, d_output); // check if kernel execution generated an error getLastCudaError("Error: render (bicubicTexture) Kernel execution FAILED"); checkCudaErrors(cudaDeviceSynchronize()); cudaMemcpy(h_result, d_output, imageWidth*imageHeight*4, cudaMemcpyDeviceToHost); sdkSavePPM4ub(dump_filename, (unsigned char *)h_result, imageWidth, imageHeight); checkCudaErrors(cudaFree(d_output)); free(h_result); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); }
//////////////////////////////////////////////////////////////////////////////// //! Run a simple benchmark test for CUDA //////////////////////////////////////////////////////////////////////////////// int runBenchmark() { printf("[runBenchmark]: [%s]\n", sSDKsample); initCuda(true); unsigned int *d_result; checkCudaErrors(cudaMalloc((void **)&d_result, width*height*sizeof(unsigned int))); // warm-up boxFilterRGBA(d_img, d_temp, d_temp, width, height, filter_radius, iterations, nthreads, kernel_timer); checkCudaErrors(cudaDeviceSynchronize()); sdkStartTimer(&kernel_timer); // Start round-trip timer and process iCycles loops on the GPU iterations = 1; // standard 1-pass filtering const int iCycles = 150; double dProcessingTime = 0.0; printf("\nRunning BoxFilterGPU for %d cycles...\n\n", iCycles); for (int i = 0; i < iCycles; i++) { dProcessingTime += boxFilterRGBA(d_img, d_temp, d_img, width, height, filter_radius, iterations, nthreads, kernel_timer); } // check if kernel execution generated an error and sync host getLastCudaError("Error: boxFilterRGBA Kernel execution FAILED"); checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&kernel_timer); // Get average computation time dProcessingTime /= (double)iCycles; // log testname, throughput, timing and config info to sample and master logs printf("boxFilter-texture, Throughput = %.4f M RGBA Pixels/s, Time = %.5f s, Size = %u RGBA Pixels, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * width * height)/dProcessingTime, dProcessingTime, (width * height), 1, nthreads); printf("\n"); return 0; }
HRESULT RegisterD3D9ResourceWithCUDA() { // 2D // register the Direct3D resources that we'll use // we'll read to and write from g_texture_2d, so don't set any special map flags for it cudaGraphicsD3D9RegisterResource(&g_texture_2d.cudaResource, g_texture_2d.pTexture, cudaGraphicsRegisterFlagsNone); getLastCudaError("cudaGraphicsD3D9RegisterResource (g_texture_2d) failed"); // cuda cannot write into the texture directly : the texture is seen as a cudaArray and can only be mapped as a texture // Create a buffer so that cuda can write into it // pixel fmt is DXGI_FORMAT_R32G32B32A32_FLOAT cudaMallocPitch(&g_texture_2d.cudaLinearMemory, &g_texture_2d.pitch, g_texture_2d.width * sizeof(float) * 4, g_texture_2d.height); getLastCudaError("cudaMallocPitch (g_texture_2d) failed"); cudaMemset(g_texture_2d.cudaLinearMemory, 1, g_texture_2d.pitch * g_texture_2d.height); // CUBE cudaGraphicsD3D9RegisterResource(&g_texture_cube.cudaResource, g_texture_cube.pTexture, cudaGraphicsRegisterFlagsNone); getLastCudaError("cudaGraphicsD3D9RegisterResource (g_texture_cube) failed"); // create the buffer. pixel fmt is DXGI_FORMAT_R8G8B8A8_SNORM cudaMallocPitch(&g_texture_cube.cudaLinearMemory, &g_texture_cube.pitch, g_texture_cube.size * 4, g_texture_cube.size); getLastCudaError("cudaMallocPitch (g_texture_cube) failed"); cudaMemset(g_texture_cube.cudaLinearMemory, 1, g_texture_cube.pitch * g_texture_cube.size); getLastCudaError("cudaMemset (g_texture_cube) failed"); // 3D cudaGraphicsD3D9RegisterResource(&g_texture_vol.cudaResource, g_texture_vol.pTexture, cudaGraphicsRegisterFlagsNone); getLastCudaError("cudaGraphicsD3D9RegisterResource (g_texture_vol) failed"); // create the buffer. pixel fmt is DXGI_FORMAT_R8G8B8A8_SNORM //cudaMallocPitch(&g_texture_vol.cudaLinearMemory, &g_texture_vol.pitch, g_texture_vol.width * 4, g_texture_vol.height * g_texture_vol.depth); cudaMalloc(&g_texture_vol.cudaLinearMemory, g_texture_vol.width * 4 * g_texture_vol.height * g_texture_vol.depth); g_texture_vol.pitch = g_texture_vol.width * 4; getLastCudaError("cudaMallocPitch (g_texture_vol) failed"); cudaMemset(g_texture_vol.cudaLinearMemory, 1, g_texture_vol.pitch * g_texture_vol.height * g_texture_vol.depth); getLastCudaError("cudaMemset (g_texture_vol) failed"); return S_OK; }
// render image using CUDA void render() { VolumeRender_copyInvViewMatrix(invViewMatrix, sizeof(float4)*3); // map PBO to get CUDA device pointer uint *d_output; // map PBO to get CUDA device pointer checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); size_t num_bytes; checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, cuda_pbo_resource)); //printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes); // clear image checkCudaErrors(cudaMemset(d_output, 0, width*height*4)); // call CUDA kernel, writing results to PBO VolumeRender_render(gridSize, blockSize, d_output, width, height, density, brightness, transferOffset, transferScale); getLastCudaError("render kernel failed"); checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); }
int main(int argc, char **argv) { int devID; cudaDeviceProp deviceProps; printf("%s Starting...\n\n", sSDKname); printf("[%s] - [OpenGL/CUDA simulation] starting...\n", sSDKname); // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. if (false == initGL(&argc, argv)) { exit(EXIT_SUCCESS); } // use command-line specified CUDA device, otherwise use device with highest Gflops/s devID = findCudaGLDevice(argc, (const char **)argv); // get number of SMs on this GPU checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount); // automated build testing harness if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); } // Allocate and initialize host data GLint bsize; sdkCreateTimer(&timer); sdkResetTimer(&timer); hvfield = (cData *)malloc(sizeof(cData) * DS); memset(hvfield, 0, sizeof(cData) * DS); // Allocate and initialize device data cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(cData)*DIM, DIM); cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS, cudaMemcpyHostToDevice); // Temporary complex velocity field data cudaMalloc((void **)&vxfield, sizeof(cData) * PDS); cudaMalloc((void **)&vyfield, sizeof(cData) * PDS); setupTexture(DIM, DIM); bindTexture(); // Create particle array particles = (cData *)malloc(sizeof(cData) * DS); memset(particles, 0, sizeof(cData) * DS); initParticles(particles, DIM, DIM); // Create CUFFT transform plan configuration cufftPlan2d(&planr2c, DIM, DIM, CUFFT_R2C); cufftPlan2d(&planc2r, DIM, DIM, CUFFT_C2R); // TODO: update kernels to use the new unpadded memory layout for perf // rather than the old FFTW-compatible layout cufftSetCompatibilityMode(planr2c, CUFFT_COMPATIBILITY_FFTW_PADDING); cufftSetCompatibilityMode(planc2r, CUFFT_COMPATIBILITY_FFTW_PADDING); glGenBuffersARB(1, &vbo); glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo); glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(cData) * DS, particles, GL_DYNAMIC_DRAW_ARB); glGetBufferParameterivARB(GL_ARRAY_BUFFER_ARB, GL_BUFFER_SIZE_ARB, &bsize); if (bsize != (sizeof(cData) * DS)) goto EXTERR; glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0); checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone)); getLastCudaError("cudaGraphicsGLRegisterBuffer failed"); if (ref_file) { autoTest(argv); cleanup(); cudaDeviceReset(); printf("[fluidsGL] - Test Results: %d Failures\n", g_TotalErrors); exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); } else { atexit(cleanup); glutMainLoop(); } cudaDeviceReset(); if (!ref_file) { exit(EXIT_SUCCESS); } return 0; EXTERR: printf("Failed to initialize GL extensions.\n"); cudaDeviceReset(); exit(EXIT_FAILURE); }
//----------------------------------------------------------------------------- // Name: Cleanup() // Desc: Releases all previously initialized objects //----------------------------------------------------------------------------- VOID Cleanup() { if (g_histogram.pBuffer != NULL) { // Unregister vertex buffer cudaGraphicsUnregisterResource(g_histogram.cudaResource); getLastCudaError("cudaGraphicsUnregisterResource failed"); g_histogram.pBuffer->Release(); } if (g_histogram.pBufferSRV != NULL) { g_histogram.pBufferSRV->Release(); } if (g_pDisplayEffect != NULL) { g_pDisplayEffect->Release(); } if (g_pCompositeEffect != NULL) { g_pCompositeEffect->Release(); } if (g_color.pBufferSRV != NULL) { g_color.pBufferSRV->Release(); } if (g_color.pBufferRTV != NULL) { g_color.pBufferRTV->Release(); } if (g_color.pBuffer != NULL) { // Unregister vertex buffer cudaGraphicsUnregisterResource(g_color.cudaResource); getLastCudaError("cudaD3D10UnregisterResource failed"); g_color.pBuffer->Release(); } if (g_pRasterState != NULL) { g_pRasterState->Release(); } if (g_pSwapChainRTV != NULL) { g_pSwapChainRTV->Release(); } if (g_pSwapChain != NULL) { g_pSwapChain->Release(); } // Uninitialize CUDA // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); getLastCudaError("cudaDeviceReset failed"); if (g_pd3dDevice != NULL) { g_pd3dDevice->Release(); } }
T benchmarkReduce(int n, int numThreads, int numBlocks, int maxThreads, int maxBlocks, int whichKernel, int testIterations, bool cpuFinalReduction, int cpuFinalThreshold, StopWatchInterface *timer, T *h_odata, T *d_idata, T *d_odata) { T gpu_result = 0; bool needReadBack = true; for (int i = 0; i < testIterations; ++i) { gpu_result = 0; cudaDeviceSynchronize(); sdkStartTimer(&timer); // execute the kernel reduce<T>(n, numThreads, numBlocks, whichKernel, d_idata, d_odata); // check if kernel execution generated an error getLastCudaError("Kernel execution failed"); if (cpuFinalReduction) { // sum partial sums from each block on CPU // copy result from device to host checkCudaErrors(cudaMemcpy(h_odata, d_odata, numBlocks*sizeof(T), cudaMemcpyDeviceToHost)); for (int i=0; i<numBlocks; i++) { gpu_result += h_odata[i]; } needReadBack = false; } else { // sum partial block sums on GPU int s=numBlocks; int kernel = whichKernel; while (s > cpuFinalThreshold) { int threads = 0, blocks = 0; getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads); reduce<T>(s, threads, blocks, kernel, d_odata, d_odata); if (kernel < 3) { s = (s + threads - 1) / threads; } else { s = (s + (threads*2-1)) / (threads*2); } } if (s > 1) { // copy result from device to host checkCudaErrors(cudaMemcpy(h_odata, d_odata, s * sizeof(T), cudaMemcpyDeviceToHost)); for (int i=0; i < s; i++) { gpu_result += h_odata[i]; } needReadBack = false; } } cudaDeviceSynchronize(); sdkStopTimer(&timer); } if (needReadBack) { // copy final sum from device to host checkCudaErrors(cudaMemcpy(&gpu_result, d_odata, sizeof(T), cudaMemcpyDeviceToHost)); } return gpu_result; }
//////////////////////////////////////////////////////////////////////////////// //! Run the Cuda part of the computation //////////////////////////////////////////////////////////////////////////////// void RunKernels() { static float t = 0.0f; // populate the 2d texture { cudaArray *cuArray; cudaGraphicsSubResourceGetMappedArray(&cuArray, g_texture_2d.cudaResource, 0, 0); getLastCudaError("cudaGraphicsSubResourceGetMappedArray (cuda_texture_2d) failed"); // kick off the kernel and send the staging buffer cudaLinearMemory as an argument to allow the kernel to write to it cuda_texture_2d(g_texture_2d.cudaLinearMemory, g_texture_2d.width, g_texture_2d.height, g_texture_2d.pitch, t); getLastCudaError("cuda_texture_2d failed"); // then we want to copy cudaLinearMemory to the D3D texture, via its mapped form : cudaArray cudaMemcpy2DToArray( cuArray, // dst array 0, 0, // offset g_texture_2d.cudaLinearMemory, g_texture_2d.pitch, // src g_texture_2d.width*4*sizeof(float), g_texture_2d.height, // extent cudaMemcpyDeviceToDevice); // kind getLastCudaError("cudaMemcpy2DToArray failed"); } // populate the volume texture { size_t pitchSlice = g_texture_vol.pitch * g_texture_vol.height; cudaArray *cuArray; cudaGraphicsSubResourceGetMappedArray(&cuArray, g_texture_vol.cudaResource, 0, 0); getLastCudaError("cudaGraphicsSubResourceGetMappedArray (cuda_texture_3d) failed"); // kick off the kernel and send the staging buffer cudaLinearMemory as an argument to allow the kernel to write to it cuda_texture_volume(g_texture_vol.cudaLinearMemory, g_texture_vol.width, g_texture_vol.height, g_texture_vol.depth, g_texture_vol.pitch, pitchSlice, t); getLastCudaError("cuda_texture_3d failed"); // then we want to copy cudaLinearMemory to the D3D texture, via its mapped form : cudaArray struct cudaMemcpy3DParms memcpyParams = {0}; memcpyParams.dstArray = cuArray; memcpyParams.srcPtr.ptr = g_texture_vol.cudaLinearMemory; memcpyParams.srcPtr.pitch = g_texture_vol.pitch; memcpyParams.srcPtr.xsize = g_texture_vol.width; memcpyParams.srcPtr.ysize = g_texture_vol.height; memcpyParams.extent.width = g_texture_vol.width; memcpyParams.extent.height = g_texture_vol.height; memcpyParams.extent.depth = g_texture_vol.depth; memcpyParams.kind = cudaMemcpyDeviceToDevice; cudaMemcpy3D(&memcpyParams); getLastCudaError("cudaMemcpy3D failed"); } // populate the faces of the cube map for (int face = 0; face < 6; ++face) { cudaArray *cuArray; cudaGraphicsSubResourceGetMappedArray(&cuArray, g_texture_cube.cudaResource, face, 0); getLastCudaError("cudaGraphicsSubResourceGetMappedArray (cuda_texture_cube) failed"); // kick off the kernel and send the staging buffer cudaLinearMemory as an argument to allow the kernel to write to it cuda_texture_cube(g_texture_cube.cudaLinearMemory, g_texture_cube.size, g_texture_cube.size, g_texture_cube.pitch, face, t); getLastCudaError("cuda_texture_cube failed"); // then we want to copy cudaLinearMemory to the D3D texture, via its mapped form : cudaArray cudaMemcpy2DToArray( cuArray, // dst array 0, 0, // offset g_texture_cube.cudaLinearMemory, g_texture_cube.pitch, // src g_texture_cube.size*4, g_texture_cube.size, // extent cudaMemcpyDeviceToDevice); // kind getLastCudaError("cudaMemcpy2DToArray failed"); } t += 0.1f; }
////////////////////////////////////////////////////////////////////////// // AUTOMATIC TESTING void runSingleTest(const char *ref_file, const char *exec_path) { uint *d_output; checkCudaErrors(cudaMalloc((void **)&d_output, width*height*sizeof(uint))); checkCudaErrors(cudaMemset(d_output, 0, width*height*sizeof(uint))); float modelView[16] = { 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 4.0f, 1.0f }; invViewMatrix[0] = modelView[0]; invViewMatrix[1] = modelView[4]; invViewMatrix[2] = modelView[8]; invViewMatrix[3] = modelView[12]; invViewMatrix[4] = modelView[1]; invViewMatrix[5] = modelView[5]; invViewMatrix[6] = modelView[9]; invViewMatrix[7] = modelView[13]; invViewMatrix[8] = modelView[2]; invViewMatrix[9] = modelView[6]; invViewMatrix[10] = modelView[10]; invViewMatrix[11] = modelView[14]; // call CUDA kernel, writing results to PBO VolumeRender_copyInvViewMatrix(invViewMatrix, sizeof(float4)*3); filterAnimation = false; // Start timer 0 and process n loops on the GPU int nIter = 10; float scale = 2.0f/float(nIter-1); for (int i = -1; i < nIter; i++) { if (i == 0) { cudaDeviceSynchronize(); sdkStartTimer(&timer); } filterFactor = (float(i) * scale) - 1.0f; filterFactor = -filterFactor; filter(); VolumeRender_render(gridSize, blockSize, d_output, width, height, density, brightness, transferOffset, transferScale); } cudaDeviceSynchronize(); sdkStopTimer(&timer); // Get elapsed time and throughput, then log to sample and master logs double dAvgTime = sdkGetTimerValue(&timer)/(nIter * 1000.0); printf("volumeFiltering, Throughput = %.4f MTexels/s, Time = %.5f s, Size = %u Texels, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * width * height)/dAvgTime, dAvgTime, (width * height), 1, blockSize.x * blockSize.y); getLastCudaError("Error: kernel execution FAILED"); checkCudaErrors(cudaDeviceSynchronize()); unsigned char *h_output = (unsigned char *)malloc(width*height*4); checkCudaErrors(cudaMemcpy(h_output, d_output, width*height*4, cudaMemcpyDeviceToHost)); sdkSavePPM4ub("volumefilter.ppm", h_output, width, height); bool bTestResult = sdkComparePPM("volumefilter.ppm", sdkFindFilePath(ref_file, exec_path), MAX_EPSILON_ERROR, THRESHOLD, true); checkCudaErrors(cudaFree(d_output)); free(h_output); cleanup(); exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); }
int main(int argc, char **argv) { int devID; cudaDeviceProp deviceProps; printf("%s Starting...\n\n", sSDKname); printf("[%s] - [OpenGL/CUDA simulation] starting...\n", sSDKname); // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. if (false == initGL(&argc, argv)) { exit(EXIT_SUCCESS); } // use command-line specified CUDA device, otherwise use device with highest Gflops/s #ifndef OPTIMUS devID = findCudaGLDevice(argc, (const char **)argv); #else devID = gpuGetMaxGflopsDeviceId(); #endif // get number of SMs on this GPU checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount); // automated build testing harness if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); } // Allocate and initialize host data GLint bsize; sdkCreateTimer(&timer); sdkResetTimer(&timer); hvfield = (cData *)malloc(sizeof(cData) * DS); memset(hvfield, 0, sizeof(cData) * DS); // Allocate and initialize device data cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(cData)*DIM, DIM); cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS, cudaMemcpyHostToDevice); // Temporary complex velocity field data cudaMalloc((void **)&vxfield, sizeof(cData) * PDS); cudaMalloc((void **)&vyfield, sizeof(cData) * PDS); setupTexture(DIM, DIM); bindTexture(); // Create particle array in host memory particles = (cData *)malloc(sizeof(cData) * DS); memset(particles, 0, sizeof(cData) * DS); #ifdef BROADCAST int step = 1; // Broadcasted visualization stepping. if (argc > 3) step = atoi(argv[3]); // Create additional space to store particle packets // for broadcasting. wstep = step; hstep = step; int npackets = sizeof(float) * (DIM / wstep) * (DIM / hstep) / UdpBroadcastServer::PacketSize; if (sizeof(float) * (DIM / wstep) * (DIM / hstep) % UdpBroadcastServer::PacketSize) npackets++; packets = (char*)malloc(npackets * (UdpBroadcastServer::PacketSize + sizeof(unsigned int))); #endif initParticles(particles, DIM, DIM); #if defined(OPTIMUS) || defined(BROADCAST) // Create particle array in device memory cudaMalloc((void **)&particles_gpu, sizeof(cData) * DS); cudaMemcpy(particles_gpu, particles, sizeof(cData) * DS, cudaMemcpyHostToDevice); #endif // Create CUFFT transform plan configuration cufftPlan2d(&planr2c, DIM, DIM, CUFFT_R2C); cufftPlan2d(&planc2r, DIM, DIM, CUFFT_C2R); // TODO: update kernels to use the new unpadded memory layout for perf // rather than the old FFTW-compatible layout cufftSetCompatibilityMode(planr2c, CUFFT_COMPATIBILITY_FFTW_PADDING); cufftSetCompatibilityMode(planc2r, CUFFT_COMPATIBILITY_FFTW_PADDING); glGenBuffersARB(1, &vbo); glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo); glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(cData) * DS, particles, GL_DYNAMIC_DRAW_ARB); glGetBufferParameterivARB(GL_ARRAY_BUFFER_ARB, GL_BUFFER_SIZE_ARB, &bsize); if (bsize != (sizeof(cData) * DS)) goto EXTERR; glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0); #ifndef OPTIMUS checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone)); getLastCudaError("cudaGraphicsGLRegisterBuffer failed"); #endif if (ref_file) { autoTest(argv); cleanup(); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); printf("[fluidsGL] - Test Results: %d Failures\n", g_TotalErrors); exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); } else { #ifdef BROADCAST const char *sv_addr = "127.0.0:9097"; const char *bc_addr = "127.255.255.2:9097"; // Server address if (argc > 2) sv_addr = argv[2]; // Broadcast address if (argc > 1) bc_addr = argv[1]; server.reset(new UdpBroadcastServer(sv_addr, bc_addr)); // Listen to clients' feedbacks in a separate thread. { pthread_t tid; pthread_create(&tid, NULL, &feedback_listener, &step); } // Broadcast the particles state in a separate thread. { pthread_t tid; pthread_create(&tid, NULL, &broadcaster, &step); } #endif #if defined (__APPLE__) || defined(MACOSX) atexit(cleanup); #else glutCloseFunc(cleanup); #endif glutMainLoop(); } // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); if (!ref_file) { exit(EXIT_SUCCESS); } return 0; EXTERR: printf("Failed to initialize GL extensions.\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(EXIT_FAILURE); }
//////////////////////////////////////////////////////////////////////////////// //! Run a simple test for CUDA //////////////////////////////////////////////////////////////////////////////// void runTest(int argc, char **argv, char *ref_file) { // Register the window class WNDCLASSEX wc = { sizeof(WNDCLASSEX), CS_CLASSDC, MsgProc, 0L, 0L, GetModuleHandle(NULL), NULL, NULL, NULL, NULL, "CUDA/D3D10 simpleD3D10", NULL }; RegisterClassEx(&wc); // Create the application's window int xBorder = ::GetSystemMetrics(SM_CXSIZEFRAME); int yMenu = ::GetSystemMetrics(SM_CYMENU); int yBorder = ::GetSystemMetrics(SM_CYSIZEFRAME); HWND hWnd = CreateWindow(wc.lpszClassName, "CUDA/D3D10 simpleD3D10", WS_OVERLAPPEDWINDOW, 0, 0, g_WindowWidth + 2*xBorder, g_WindowHeight+ 2*yBorder+yMenu, NULL, NULL, wc.hInstance, NULL); // Initialize Direct3D if (SUCCEEDED(InitD3D(hWnd))) { // Create the scene geometry if (SUCCEEDED(InitGeometry())) { // Initialize interoperability between CUDA and Direct3D // Register vertex buffer with CUDA // DEPRECATED: cudaD3D10RegisterResource(g_pVB, cudaD3D10RegisterFlagsNone); cudaGraphicsD3D10RegisterResource(&cuda_VB_resource, g_pVB, cudaD3D10RegisterFlagsNone); getLastCudaError("cudaGraphicsD3D10RegisterResource (g_pVB) failed"); // Initialize vertex buffer with CUDA runCuda(); // Save result SaveResult(argc, argv); // Show the window ShowWindow(hWnd, SW_SHOWDEFAULT); UpdateWindow(hWnd); // Enter the message loop MSG msg; ZeroMemory(&msg, sizeof(msg)); while (msg.message!=WM_QUIT) { if (PeekMessage(&msg, NULL, 0U, 0U, PM_REMOVE)) { TranslateMessage(&msg); DispatchMessage(&msg); } else { Render(); if (ref_file != NULL) { for (int count=0; count<g_iFrameToCompare; count++) { Render(); } const char *cur_image_path = "simpleD3D10.ppm"; // Save a reference of our current test run image CheckRenderD3D10::ActiveRenderTargetToPPM(g_pd3dDevice,cur_image_path); // compare to offical reference image, printing PASS or FAIL. g_bPassed = CheckRenderD3D10::PPMvsPPM(cur_image_path, ref_file, argv[0],MAX_EPSILON, 0.15f); Cleanup(); PostQuitMessage(0); } } } } } // Release D3D Library (after message loop) dynlinkUnloadD3D10API(); UnregisterClass(wc.lpszClassName, wc.hInstance); }
//----------------------------------------------------------------------------- // Name: InitD3D() // Desc: Initializes Direct3D //----------------------------------------------------------------------------- HRESULT InitD3D(HWND hWnd) { // Set up the structure used to create the device and swapchain DXGI_SWAP_CHAIN_DESC sd; ZeroMemory(&sd, sizeof(sd)); sd.BufferCount = 1; sd.BufferDesc.Width = g_WindowWidth; sd.BufferDesc.Height = g_WindowHeight; sd.BufferDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM; sd.BufferDesc.RefreshRate.Numerator = 60; sd.BufferDesc.RefreshRate.Denominator = 1; sd.BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT; sd.OutputWindow = hWnd; sd.SampleDesc.Count = 1; sd.SampleDesc.Quality = 0; sd.Windowed = TRUE; // Create device and swapchain HRESULT hr = sFnPtr_D3D10CreateDeviceAndSwapChain( g_pCudaCapableAdapter, D3D10_DRIVER_TYPE_HARDWARE, NULL, 0, D3D10_SDK_VERSION, &sd, &g_pSwapChain, &g_pd3dDevice); AssertOrQuit(SUCCEEDED(hr)); g_pCudaCapableAdapter->Release(); // Create a render target view of the swapchain ID3D10Texture2D *pBuffer; hr = g_pSwapChain->GetBuffer(0, __uuidof(ID3D10Texture2D), (LPVOID *)&pBuffer); AssertOrQuit(SUCCEEDED(hr)); hr = g_pd3dDevice->CreateRenderTargetView(pBuffer, NULL, &g_pSwapChainRTV); AssertOrQuit(SUCCEEDED(hr)); pBuffer->Release(); g_pd3dDevice->OMSetRenderTargets(1, &g_pSwapChainRTV, NULL); // Setup the viewport D3D10_VIEWPORT vp; vp.Width = g_WindowWidth; vp.Height = g_WindowHeight; vp.MinDepth = 0.0f; vp.MaxDepth = 1.0f; vp.TopLeftX = 0; vp.TopLeftY = 0; g_pd3dDevice->RSSetViewports(1, &vp); // Setup the effect { ID3D10Blob *pCompiledEffect; hr = sFnPtr_D3D10CompileEffectFromMemory( (void *)g_simpleEffectSrc, sizeof(g_simpleEffectSrc), NULL, NULL, // pDefines NULL, // pIncludes 0, // HLSL flags 0, // FXFlags &pCompiledEffect, NULL); AssertOrQuit(SUCCEEDED(hr)); hr = sFnPtr_D3D10CreateEffectFromMemory( pCompiledEffect->GetBufferPointer(), pCompiledEffect->GetBufferSize(), 0, // FXFlags g_pd3dDevice, NULL, &g_pSimpleEffect); pCompiledEffect->Release(); g_pSimpleTechnique = g_pSimpleEffect->GetTechniqueByName("Render"); // g_pmWorldViewProjection = g_pSimpleEffect->GetVariableByName("g_mWorldViewProjection")->AsMatrix(); g_pmWorld = g_pSimpleEffect->GetVariableByName("g_mWorld")->AsMatrix(); g_pmView = g_pSimpleEffect->GetVariableByName("g_mView")->AsMatrix(); g_pmProjection = g_pSimpleEffect->GetVariableByName("g_mProjection")->AsMatrix(); // Define the input layout D3D10_INPUT_ELEMENT_DESC layout[] = { { "POSITION", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0, D3D10_INPUT_PER_VERTEX_DATA, 0 }, { "COLOR", 0, DXGI_FORMAT_R8G8B8A8_UNORM, 0, 12, D3D10_INPUT_PER_VERTEX_DATA, 0 }, }; UINT numElements = sizeof(layout)/sizeof(layout[0]); // Create the input layout D3D10_PASS_DESC PassDesc; g_pSimpleTechnique->GetPassByIndex(0)->GetDesc(&PassDesc); hr = g_pd3dDevice->CreateInputLayout(layout, numElements, PassDesc.pIAInputSignature, PassDesc.IAInputSignatureSize, &g_pInputLayout); AssertOrQuit(SUCCEEDED(hr)); // Setup Input Layout, apply effect and draw points g_pd3dDevice->IASetInputLayout(g_pInputLayout); g_pSimpleTechnique->GetPassByIndex(0)->Apply(0); g_pd3dDevice->IASetPrimitiveTopology(D3D10_PRIMITIVE_TOPOLOGY_POINTLIST); } // begin interop cudaD3D10SetDirect3DDevice(g_pd3dDevice); getLastCudaError("cudaD3D10SetDirect3DDevice failed"); return S_OK; }