// Toggle the n-body demo between the two floating-point precisions.
// Drains pending GPU work, flips the fp64 flag, reads the particle state out
// of the currently-active precision's demo instance, converts it element-wise,
// and installs it into the other precision's instance.
// NOTE(review): T_old / T_new appear to be parameters of an enclosing
// template header outside this view — confirm.
void switchDemoPrecision()
{
    cutilDeviceSynchronize();

    fp64 = !fp64;
    // Per-interaction FLOP count differs between the fp64 and fp32 kernels.
    flopsPerInteraction = fp64 ? 30 : 20;

    // Read back current state: 4 components per body for both pos and vel.
    T_old *oldPos = new T_old[numBodies * 4];
    T_old *oldVel = new T_old[numBodies * 4];
    NBodyDemo<T_old>::getArrays(oldPos, oldVel);

    // convert float to double
    T_new *newPos = new T_new[numBodies * 4];
    T_new *newVel = new T_new[numBodies * 4];
    for (int i = 0; i < numBodies * 4; i++)
    {
        newPos[i] = (T_new)oldPos[i];
        newVel[i] = (T_new)oldVel[i];
    }
    NBodyDemo<T_new>::setArrays(newPos, newVel);

    cutilDeviceSynchronize();

    delete [] oldPos;
    delete [] oldVel;
    delete [] newPos;
    delete [] newVel;
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple benchmark test for CUDA
//! Times iCycles passes of the box filter on the already-loaded image and
//! logs the average throughput. Exits via the normal return path; resources
//! allocated here are released before returning.
////////////////////////////////////////////////////////////////////////////////
void runBenchmark( int argc, char **argv )
{
    int devID = 0;
    shrLog("[runBenchmark]: [%s]\n", sSDKsample);

    devID = cutilChooseCudaDevice(argc, argv);

    loadImageData(argc, argv);
    initCuda();

    g_CheckRender = new CheckBackBuffer(width, height, 4, false);
    g_CheckRender->setExecPath(argv[0]);

    unsigned int *d_result;
    cutilSafeCall( cudaMalloc( (void **)&d_result, width*height*sizeof(unsigned int)) );

    // warm-up
    boxFilterRGBA(d_img, d_temp, d_temp, width, height, filter_radius, iterations, nthreads);
    cutilSafeCall( cutilDeviceSynchronize() );

    // Start round-trip timer and process iCycles loops on the GPU
    iterations = 1;     // standard 1-pass filtering
    const int iCycles = 150;
    double dProcessingTime = 0.0;
    shrLog("\nRunning BoxFilterGPU for %d cycles...\n\n", iCycles);
    shrDeltaT(2);
    for (int i = 0; i < iCycles; i++)
    {
        dProcessingTime += boxFilterRGBA(d_img, d_temp, d_img, width, height, filter_radius, iterations, nthreads);
    }

    // check if kernel execution generated an error and sync host
    cutilCheckMsg("Error: boxFilterRGBA Kernel execution FAILED");
    cutilSafeCall(cutilDeviceSynchronize());

    // Get average computation time
    dProcessingTime /= (double)iCycles;

    // log testname, throughput, timing and config info to sample and master logs
    shrLogEx(LOGBOTH | MASTER, 0, "boxFilter-texture, Throughput = %.4f M RGBA Pixels/s, Time = %.5f s, Size = %u RGBA Pixels, NumDevsUsed = %u, Workgroup = %u\n",
             (1.0e-6 * width * height)/dProcessingTime, dProcessingTime, (width * height), 1, nthreads);
    shrLog("\n");

    // FIX: d_result and g_CheckRender were previously leaked — release them
    // here, matching the cleanup pattern used by runAutoTest.
    cutilSafeCall( cudaFree(d_result) );
    delete g_CheckRender;
    g_CheckRender = NULL;
}
// Automated (non-interactive) test: runs each denoising kernel (g_Kernel 0..3)
// on portrait_noise.bmp, reads the result back, and compares it against a
// stored reference PPM. Exits the process with a pass/fail QA status.
void runAutoTest(int argc, char **argv)
{
    int devID = 0;
    printf("[%s] - (automated testing w/ readback)\n", sSDKsample);

    devID = cutilChooseCudaDevice(argc, argv);

    // First load the image, so we know what the size of the image (imageW and imageH)
    printf("Allocating host and CUDA memory and loading image file...\n");
    const char *image_path = cutFindFilePath("portrait_noise.bmp", argv[0]);
    if (image_path == NULL) {
        printf( "imageDenoisingGL was unable to find and load image file <portrait_noise.bmp>.\nExiting...\n");
        shrQAFinishExit(argc, (const char **)argv, QA_FAILED);
    }
    LoadBMPFile(&h_Src, &imageW, &imageH, image_path);
    printf("Data init done.\n");

    cutilSafeCall( CUDA_MallocArray(&h_Src, imageW, imageH) );

    g_CheckRender = new CheckBackBuffer(imageW, imageH, sizeof(TColor), false);
    g_CheckRender->setExecPath(argv[0]);

    TColor *d_dst = NULL;
    cutilSafeCall( cudaMalloc( (void **)&d_dst, imageW*imageH*sizeof(TColor)) );

    // Exercise every filter mode in turn.
    while (g_Kernel <= 3)
    {
        printf("[AutoTest]: %s <%s>\n", sSDKsample, filterMode[g_Kernel]);
        cutilSafeCall( CUDA_Bind2TextureArray() );
        runImageFilters(d_dst);
        cutilSafeCall( CUDA_UnbindTexture() );
        cutilSafeCall( cutilDeviceSynchronize() );

        // FIX: the readback's return status was previously ignored; check it
        // like every other CUDA call in this function.
        cutilSafeCall( cudaMemcpy(g_CheckRender->imageData(), d_dst, imageW*imageH*sizeof(TColor), cudaMemcpyDeviceToHost) );

        g_CheckRender->savePPM(sOriginal[g_Kernel], true, NULL);
        if (!g_CheckRender->PPMvsPPM(sOriginal[g_Kernel], sReference[g_Kernel], MAX_EPSILON_ERROR, 0.15f)) {
            g_TotalErrors++;
        }
        g_Kernel++;
    }

    cutilSafeCall( CUDA_FreeArray() );
    free(h_Src);
    cutilSafeCall( cudaFree( d_dst ) );
    delete g_CheckRender;

    printf("\n[%s] -> Test Results: %d errors\n", sSDKsample, g_TotalErrors);

    cutilDeviceReset();
    shrQAFinishExit(argc, (const char **)argv, (!g_TotalErrors ? QA_PASSED : QA_FAILED));
}
// Automated test for the bilateral filter: runs four passes with
// progressively wider gaussian/euclidean deltas and compares each result
// against a stored reference image, accumulating failures in g_TotalErrors.
void runAutoTest(int argc, char **argv)
{
    int devID = 0;
    shrLog("[runAutoTest]: [%s] (automated testing w/ readback)\n", sSDKsample);

    devID = cutilChooseCudaDevice(argc, argv);

    loadImageData(argc, argv);
    initCuda();

    g_CheckRender = new CheckBackBuffer(width, height, 4, false);
    g_CheckRender->setExecPath(argv[0]);

    unsigned int *d_result;
    cutilSafeCall( cudaMalloc( (void **)&d_result, width*height*sizeof(unsigned int)) );

    for(int i = 0; i < 4; i++)
    {
        shrLog("[AutoTest]: %s (radius=%d)", sSDKsample, filter_radius );
        bilateralFilterRGBA(d_result, width, height, euclidean_delta, filter_radius, iterations, nthreads);

        // check if kernel execution generated an error
        cutilCheckMsg("Error: bilateralFilterRGBA Kernel execution FAILED");
        cutilSafeCall( cutilDeviceSynchronize() );

        // FIX: the readback's return status was previously ignored; check it
        // like every other CUDA call in this function.
        cutilSafeCall( cudaMemcpy(g_CheckRender->imageData(), d_result, width*height*sizeof(unsigned int), cudaMemcpyDeviceToHost) );

        g_CheckRender->savePPM(sOriginal[i], false, NULL);
        if (!g_CheckRender->PPMvsPPM(sOriginal[i], sReference[i], MAX_EPSILON_ERROR, 0.15f)) {
            g_TotalErrors++;
        }

        // Widen the filter for the next pass.
        gaussian_delta += 1.0f;
        euclidean_delta *= 1.25f;
        updateGaussian(gaussian_delta, filter_radius);
    }

    cutilSafeCall( cudaFree( d_result ) );
    delete g_CheckRender;
}
// Test driver: benchmarks and validates the 64-bin and 256-bin GPU histogram
// implementations against CPU reference implementations. Exits with a QA
// pass/fail status covering both histogram sizes.
int main(int argc, char **argv)
{
    uchar *h_Data;
    uint *h_HistogramCPU, *h_HistogramGPU;
    uchar *d_Data;
    uint *d_Histogram;
    uint hTimer;
    int PassFailFlag = 1;          // stays 1 only if BOTH histograms validate
    uint byteCount = 64 * 1048576; // default input size: 64 MB
    uint uiSizeMult = 1;
    cudaDeviceProp deviceProp;
    deviceProp.major = 0;
    deviceProp.minor = 0;
    int dev;

    shrQAStart(argc, argv);

    // set logfile name and start logs
    shrSetLogFileName ("histogram.txt");

    //Use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
        dev = cutilDeviceInit(argc, argv);
        if (dev < 0) {
            printf("No CUDA Capable Devices found, exiting...\n");
            shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
        }
    } else {
        // NOTE(review): cudaChooseDevice is called with a zero-initialized
        // prop right after cudaSetDevice and may overwrite 'dev' — confirm
        // this is intended.
        cudaSetDevice( dev = cutGetMaxGflopsDeviceId() );
        cutilSafeCall( cudaChooseDevice(&dev, &deviceProp) );
    }
    cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev) );
    printf("CUDA device [%s] has %d Multi-Processors, Compute %d.%d\n",
           deviceProp.name, deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);

    // Encode compute capability as 0xMm and require at least 1.1.
    int version = deviceProp.major * 0x10 + deviceProp.minor;
    if(version < 0x11) {
        printf("There is no device supporting a minimum of CUDA compute capability 1.1 for this SDK sample\n");
        cutilDeviceReset();
        shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
    }

    cutilCheckError(cutCreateTimer(&hTimer));

    // Optional Command-line multiplier to increase size of array to histogram
    if (shrGetCmdLineArgumentu(argc, (const char**)argv, "sizemult", &uiSizeMult)) {
        uiSizeMult = CLAMP(uiSizeMult, 1, 10);
        byteCount *= uiSizeMult;
    }

    shrLog("Initializing data...\n");
    shrLog("...allocating CPU memory.\n");
    // Host histograms are sized for 256 bins; the 64-bin pass reuses the same
    // buffers (64 < 256).
    h_Data = (uchar *)malloc(byteCount);
    h_HistogramCPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));
    h_HistogramGPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));

    shrLog("...generating input data\n");
    srand(2009);    // fixed seed for reproducible input
    for(uint i = 0; i < byteCount; i++)
        h_Data[i] = rand() % 256;

    shrLog("...allocating GPU memory and copying input data\n\n");
    cutilSafeCall( cudaMalloc((void **)&d_Data, byteCount ) );
    cutilSafeCall( cudaMalloc((void **)&d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint) ) );
    cutilSafeCall( cudaMemcpy(d_Data, h_Data, byteCount, cudaMemcpyHostToDevice) );

    // ----- 64-bin histogram: benchmark + validation -----
    {
        shrLog("Starting up 64-bin histogram...\n\n");
        initHistogram64();

        shrLog("Running 64-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns);
        for(int iter = -1; iter < numRuns; iter++){
            //iter == -1 -- warmup iteration
            if(iter == 0){
                cutilSafeCall( cutilDeviceSynchronize() );
                cutilCheckError( cutResetTimer(hTimer) );
                cutilCheckError( cutStartTimer(hTimer) );
            }
            histogram64(d_Histogram, d_Data, byteCount);
        }
        cutilSafeCall( cutilDeviceSynchronize() );
        cutilCheckError( cutStopTimer(hTimer));

        // Average time per run (timer spans all numRuns launches).
        double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns;
        shrLog("histogram64() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs);
        shrLogEx(LOGBOTH | MASTER, 0, "histogram64, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n",
                 (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM64_THREADBLOCK_SIZE);

        shrLog("\nValidating GPU results...\n");
        shrLog(" ...reading back GPU results\n");
        cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM64_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) );

        shrLog(" ...histogram64CPU()\n");
        histogram64CPU( h_HistogramCPU, h_Data, byteCount );

        shrLog(" ...comparing the results...\n");
        for(uint i = 0; i < HISTOGRAM64_BIN_COUNT; i++)
            if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0;
        shrLog(PassFailFlag ? " ...64-bin histograms match\n\n" : " ***64-bin histograms do not match!!!***\n\n" );

        shrLog("Shutting down 64-bin histogram...\n\n\n");
        closeHistogram64();
    }

    // ----- 256-bin histogram: benchmark + validation -----
    {
        shrLog("Initializing 256-bin histogram...\n");
        initHistogram256();

        shrLog("Running 256-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns);
        for(int iter = -1; iter < numRuns; iter++){
            //iter == -1 -- warmup iteration
            if(iter == 0){
                cutilSafeCall( cutilDeviceSynchronize() );
                cutilCheckError( cutResetTimer(hTimer) );
                cutilCheckError( cutStartTimer(hTimer) );
            }
            histogram256(d_Histogram, d_Data, byteCount);
        }
        cutilSafeCall( cutilDeviceSynchronize() );
        cutilCheckError( cutStopTimer(hTimer));

        double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns;
        shrLog("histogram256() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs);
        shrLogEx(LOGBOTH | MASTER, 0, "histogram256, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n",
                 (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM256_THREADBLOCK_SIZE);

        shrLog("\nValidating GPU results...\n");
        shrLog(" ...reading back GPU results\n");
        cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) );

        shrLog(" ...histogram256CPU()\n");
        histogram256CPU( h_HistogramCPU, h_Data, byteCount );

        shrLog(" ...comparing the results\n");
        for(uint i = 0; i < HISTOGRAM256_BIN_COUNT; i++)
            if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0;
        shrLog(PassFailFlag ? " ...256-bin histograms match\n\n" : " ***256-bin histograms do not match!!!***\n\n" );

        shrLog("Shutting down 256-bin histogram...\n\n\n");
        closeHistogram256();
    }

    shrLog("Shutting down...\n");
    cutilCheckError(cutDeleteTimer(hTimer));
    cutilSafeCall( cudaFree(d_Histogram) );
    cutilSafeCall( cudaFree(d_Data) );
    free(h_HistogramGPU);
    free(h_HistogramCPU);
    free(h_Data);

    cutilDeviceReset();

    shrLog("%s - Test Summary\n", sSDKsample);
    // pass or fail (for both 64 bit and 256 bit histograms)
    shrQAFinishExit(argc, (const char **)argv, (PassFailFlag ? QA_PASSED : QA_FAILED));
}
// Vector-addition smoke test: fills two host arrays with random values in
// [1, 100], adds them on the GPU via runKernel(), and verifies every element
// on the CPU. Prints "Succes" only if all elements match.
int main()
{
    int width = 512;
    int height = 512;

    // Select the device with the highest Gflops/s rating.
    cutilSafeCall( cudaSetDevice( cutGetMaxGflopsDeviceId() ) );

    // Host buffers.
    int * a = new int[width*height];
    int * b = new int[width*height];
    int * res = new int[width*height];
    for(int i = 0; i < width*height; i++)
    {
        a[i] = (int)ceil(((double)rand()/ (double)RAND_MAX)*100);
        b[i] = (int)ceil(((double)rand()/ (double)RAND_MAX)*100);
    }

    // Device buffers.
    unsigned int size = width * height * sizeof(int);
    std::cout << "Allocation d'un buffer de taille " << width*height << "\n";
    int* d_a = NULL;
    int* d_b = NULL;
    int* d_res = NULL;
    cutilSafeCall( cudaMalloc( (void**) &d_a, size));
    cutilSafeCall( cudaMalloc( (void**) &d_b, size));
    cutilSafeCall( cudaMalloc( (void**) &d_res, size));

    // Host -> device copies.
    cutilSafeCall( cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice));
    cutilSafeCall( cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice));

    // Launch the kernel and wait for completion.
    std::cout << "Lancer le kernel ... \n";
    runKernel(d_a, d_b, d_res, width*height);
    cutilSafeCall( cutilDeviceSynchronize() );

    // Device -> host copy of the result.
    cutilSafeCall( cudaMemcpy(res, d_res, size, cudaMemcpyDeviceToHost));

    // Verify the result.
    // FIX: the original decided pass/fail with (i == width*height), but its
    // verification loop never breaks, so i always reached width*height and
    // the test reported "Succes" even when mismatches had been printed.
    // Count failures explicitly instead (still printing every mismatch).
    int nbErrors = 0;
    for(int i = 0; i < width*height; i++)
    {
        if(res[i] != a[i] + b[i])
        {
            std::cout << "Error : [" << i << "] " << res[i] << " != " << a[i] << " + " << b[i] << std::endl;
            nbErrors++;
        }
    }

    // Release device resources.
    cutilSafeCall(cudaFree(d_a));
    cutilSafeCall(cudaFree(d_b));
    cutilSafeCall(cudaFree(d_res));

    // Release host resources.
    delete[] res;
    delete[] a;
    delete[] b;

    // Close device.
    cutilDeviceReset();

    std::cout << "Test result : " << ((nbErrors == 0) ? "Succes" : "Error" ) << std::endl;
    std::cout.flush();
    return 0;
}
////////////////////////////////////////////////////////////////////////////////
// Test driver
////////////////////////////////////////////////////////////////////////////////
// Benchmarks the GPU merge sort on N random key/value pairs, then validates
// that the output keys are sorted and that values still pair with their keys.
int main(int argc, char** argv){
    uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
    uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
    uint hTimer;

    const uint N = 4 * 1048576;   // number of elements to sort
    const uint DIR = 1;           // sort direction flag passed to mergeSort
                                  // (presumably 1 == ascending — confirm)
    const uint numValues = 65536; // keys drawn from [0, numValues)

    shrQAStart(argc, argv);

    printf("Allocating and initializing host arrays...\n\n");
    cutCreateTimer(&hTimer);
    h_SrcKey = (uint *)malloc(N * sizeof(uint));
    h_SrcVal = (uint *)malloc(N * sizeof(uint));
    h_DstKey = (uint *)malloc(N * sizeof(uint));
    h_DstVal = (uint *)malloc(N * sizeof(uint));

    srand(2009);    // fixed seed for reproducible input
    for(uint i = 0; i < N; i++)
        h_SrcKey[i] = rand() % numValues;
    fillValues(h_SrcVal, N);

    printf("Allocating and initializing CUDA arrays...\n\n");
    // Dst = final output, Buf = ping-pong scratch, Src = input.
    cutilSafeCall( cudaMalloc((void **)&d_DstKey, N * sizeof(uint)) );
    cutilSafeCall( cudaMalloc((void **)&d_DstVal, N * sizeof(uint)) );
    cutilSafeCall( cudaMalloc((void **)&d_BufKey, N * sizeof(uint)) );
    cutilSafeCall( cudaMalloc((void **)&d_BufVal, N * sizeof(uint)) );
    cutilSafeCall( cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)) );
    cutilSafeCall( cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)) );
    cutilSafeCall( cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice) );
    cutilSafeCall( cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice) );

    printf("Initializing GPU merge sort...\n");
    initMergeSort();

    printf("Running GPU merge sort...\n");
    // Drain pending work so the timer measures only the sort itself.
    cutilSafeCall( cutilDeviceSynchronize() );
    cutResetTimer(hTimer);
    cutStartTimer(hTimer);
    mergeSort(
        d_DstKey,
        d_DstVal,
        d_BufKey,
        d_BufVal,
        d_SrcKey,
        d_SrcVal,
        N,
        DIR
    );
    cutilSafeCall( cutilDeviceSynchronize() );
    cutStopTimer(hTimer);
    printf("Time: %f ms\n", cutGetTimerValue(hTimer));

    printf("Reading back GPU merge sort results...\n");
    cutilSafeCall( cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost) );
    cutilSafeCall( cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost) );

    printf("Inspecting the results...\n");
    // Keys must be sorted; values must still correspond to their source keys.
    uint keysFlag = validateSortedKeys(
        h_DstKey,
        h_SrcKey,
        1,
        N,
        numValues,
        DIR
    );
    uint valuesFlag = validateSortedValues(
        h_DstKey,
        h_DstVal,
        h_SrcKey,
        1,
        N
    );

    printf("Shutting down...\n");
    closeMergeSort();
    cutilCheckError( cutDeleteTimer(hTimer) );
    cutilSafeCall( cudaFree(d_SrcVal) );
    cutilSafeCall( cudaFree(d_SrcKey) );
    cutilSafeCall( cudaFree(d_BufVal) );
    cutilSafeCall( cudaFree(d_BufKey) );
    cutilSafeCall( cudaFree(d_DstVal) );
    cutilSafeCall( cudaFree(d_DstKey) );
    free(h_DstVal);
    free(h_DstKey);
    free(h_SrcVal);
    free(h_SrcKey);

    cutilDeviceReset();
    shrQAFinishExit(argc, (const char **)argv, (keysFlag && valuesFlag) ? QA_PASSED : QA_FAILED);
}
// Automated Sobel-filter test: runs each display mode (0..2) over the input
// image and compares the output against stored reference PGMs. Exits the
// process with a pass/fail QA status.
void runAutoTest(int argc, char **argv)
{
    printf("[%s] (automated testing w/ readback)\n", sSDKsample);

    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
        int device = cutilDeviceInit(argc, argv);
        if (device < 0) {
            printf("No CUDA Capable devices found, exiting...\n");
            shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
        }
        checkDeviceMeetComputeSpec( argc, argv );
    } else {
        int dev = findCapableDevice(argc, argv);
        if( dev != -1 )
            cudaSetDevice( dev );
        else {
            cutilDeviceReset();
            // NOTE(review): exiting with QA_PASSED when no capable device was
            // found looks suspicious — other drivers here use QA_WAIVED for
            // that case; confirm intent.
            shrQAFinishExit2(g_bQAReadback, *pArgc, (const char **)pArgv, QA_PASSED);
        }
    }

    // FIX: the default image was previously loaded unconditionally AND again
    // in the else-branch below, loading it twice when no --file argument was
    // given. Load exactly once: a --file argument wins, otherwise the default.
    if (argc > 1) {
        char *filename;
        if (cutGetCmdLineArgumentstr(argc, (const char **)argv, "file", &filename)) {
            initializeData(filename, argc, argv);
        } else {
            loadDefaultImage( argc, argv );
        }
    } else {
        loadDefaultImage( argc, argv );
    }

    g_CheckRender = new CheckBackBuffer(imWidth, imHeight, sizeof(Pixel), false);
    g_CheckRender->setExecPath(argv[0]);

    Pixel *d_result;
    cutilSafeCall( cudaMalloc( (void **)&d_result, imWidth*imHeight*sizeof(Pixel)) );

    while (g_SobelDisplayMode <= 2)
    {
        printf("AutoTest: %s <%s>\n", sSDKsample, filterMode[g_SobelDisplayMode]);

        sobelFilter(d_result, imWidth, imHeight, g_SobelDisplayMode, imageScale, blockOp, pointOp );
        cutilSafeCall( cutilDeviceSynchronize() );

        // FIX: check the readback status like the other CUDA calls here.
        cutilSafeCall( cudaMemcpy(g_CheckRender->imageData(), d_result, imWidth*imHeight*sizeof(Pixel), cudaMemcpyDeviceToHost) );

        g_CheckRender->savePGM(sOriginal[g_Index], false, NULL);
        if (!g_CheckRender->PGMvsPGM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, 0.15f)) {
            g_TotalErrors++;
        }
        g_Index++;
        g_SobelDisplayMode = (SobelDisplayMode)g_Index;
    }

    cutilSafeCall( cudaFree( d_result ) );
    delete g_CheckRender;

    shrQAFinishExit(argc, (const char **)argv, (!g_TotalErrors ? QA_PASSED : QA_FAILED) );
}
// Benchmark driver for the GPU max-reduction: runs the reduction
// testIterations times (timing each pass) and returns the reduced maximum.
// Partial per-block results are combined either on the CPU
// (cpuFinalReduction) or by repeated GPU passes down to cpuFinalThreshold.
// NOTE(review): T appears to be a parameter of an enclosing template header
// outside this view — confirm.
T benchmarkReduceMax(int n, int numThreads, int numBlocks, int maxThreads,
                     int maxBlocks, int whichKernel, int testIterations,
                     bool cpuFinalReduction, int cpuFinalThreshold,
                     unsigned int timer, T* h_odata, T* d_idata, T* d_odata)
{
    T gpu_result = 0;
    bool needReadBack = true;

    for (int i = 0; i < testIterations; ++i)
    {
        gpu_result = 0;
        cutilDeviceSynchronize();
        cutilCheckError( cutStartTimer( timer));

        // execute the kernel
        maxreduce<T>(n, numThreads, numBlocks, whichKernel, d_idata, d_odata);

        // check if kernel execution generated an error
        cutilCheckMsg("Kernel execution failed");

        if (cpuFinalReduction)
        {
            // combine the per-block partial maxima on the CPU
            // copy result from device to host
            cutilSafeCallNoSync( cudaMemcpy( h_odata, d_odata, numBlocks*sizeof(T), cudaMemcpyDeviceToHost) );

            // FIX: the partial results of a max-reduction must be combined
            // with max, not summed — the original used '+=', which is the
            // combine step of a sum-reduction and gave wrong results
            // whenever numBlocks > 1.
            gpu_result = h_odata[0];
            for (int j = 1; j < numBlocks; j++)
            {
                if (h_odata[j] > gpu_result)
                    gpu_result = h_odata[j];
            }

            needReadBack = false;
        }
        else
        {
            // reduce partial block maxima on the GPU
            int s = numBlocks;
            int kernel = whichKernel;
            while (s > cpuFinalThreshold)
            {
                int threads = 0, blocks = 0;
                getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads);
                maxreduce<T>(s, threads, blocks, kernel, d_odata, d_odata);
                // kernels >= 3 process two elements per thread
                if (kernel < 3)
                    s = (s + threads - 1) / threads;
                else
                    s = (s + (threads*2-1)) / (threads*2);
            }
            if (s > 1)
            {
                // copy remaining partials from device to host
                cutilSafeCallNoSync( cudaMemcpy( h_odata, d_odata, s * sizeof(T), cudaMemcpyDeviceToHost) );

                // FIX: combine the remaining partials with max, not '+='.
                gpu_result = h_odata[0];
                for (int j = 1; j < s; j++)
                {
                    if (h_odata[j] > gpu_result)
                        gpu_result = h_odata[j];
                }

                needReadBack = false;
            }
        }

        cutilDeviceSynchronize();
        cutilCheckError( cutStopTimer(timer) );
    }

    if (needReadBack)
    {
        // final maximum already resides in d_odata[0]
        cutilSafeCallNoSync( cudaMemcpy( &gpu_result, d_odata, sizeof(T), cudaMemcpyDeviceToHost) );
    }

    return gpu_result;
}
// Interactive CUDA/OpenGL interop demo: a CUDA kernel animates a
// mesh_width x mesh_height grid of float4 points stored in an OpenGL VBO,
// rendered as GL_POINTS every frame until the window is closed.
int main()
{
    int width = 800;
    int height = 600;
    int mesh_width = 256;
    int mesh_height = 256;

    // Bind CUDA to the best device, with OpenGL interop enabled.
    cutilSafeCall( cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ) );

    // Create the window.
    C3::Window OpenGLWin;
    OpenGLWin.Create(C3::WindowMode(width,height),"CudaC3");

    // Initialize GLEW.
    GLenum err = glewInit();
    if(err != GLEW_OK)
        std::cout << "Error on GLEW initialization.\n";

    // Basic OpenGL state.
    glClearColor(0.0, 0.0, 0.0, 1.0);
    glDisable(GL_DEPTH_TEST);
    glViewport(0, 0, width, height);
    glMatrixMode(GL_PROJECTION);
    glLoadIdentity();
    gluPerspective(60.0, (GLfloat)width / (GLfloat) height, 0.1, 10.0);

    // VBO
    // *** Create
    GLuint vbo;
    glGenBuffers(1, &vbo);
    glBindBuffer(GL_ARRAY_BUFFER, vbo);
    // *** Initialize: one float4 position per mesh vertex
    unsigned int size = mesh_width * mesh_height * 4 * sizeof(float);
    glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW);
    glBindBuffer(GL_ARRAY_BUFFER, 0);
    // *** Register with CUDA; the kernel overwrites the whole buffer each frame
    cudaGraphicsResource *cuda_vbo_resource = NULL;
    cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsWriteDiscard));

    float g_fAnim = 0.f;
    int nbFrame = 0;
    float timeFPS = 0.f;
    while(OpenGLWin.IsOpened())
    {
        // Event handling: close on window-close or Escape.
        C3::Event event;
        while(OpenGLWin.PoolEvent(event))
        {
            if(event.Type == C3::Event::Closed)
            {
                std::cout << "Close ... " << std::endl;
                OpenGLWin.Close();
            }
            else if(event.Type == C3::Event::KeyPressed)
            {
                if(event.Key.Code == C3::Key::Escape)
                {
                    std::cout << "Close ... " << std::endl;
                    OpenGLWin.Close();
                }
            }
        }

        // Advance animation time and the FPS counter (frame time is in ms).
        g_fAnim += OpenGLWin.GetFrameTime() / 1000.f;
        timeFPS += OpenGLWin.GetFrameTime() / 1000.f;
        nbFrame++;
        if(timeFPS > 1.0f)
        {
            // Refresh the FPS display in the window title once per second.
            std::stringstream ss;
            ss << "CudaC3 [" << (int)ceil( nbFrame / timeFPS ) << " FPS]";
            OpenGLWin.SetTitle(ss.str());
            timeFPS = 0.f;
            nbFrame = 0;
        }

        // Draw the scene
        glClear( GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT );

        // CUDA pass
        // *** map OpenGL buffer object for writing from CUDA
        float4 *dptr;
        cutilSafeCall(cudaGraphicsMapResources(1, &cuda_vbo_resource, 0));
        size_t num_bytes;
        cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void **)&dptr, &num_bytes, cuda_vbo_resource));
        // *** Run kernel
        runKernel(dptr, mesh_width, mesh_height,g_fAnim);
        cutilSafeCall( cutilDeviceSynchronize() );
        // *** Unmap (must happen before GL uses the buffer)
        cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0));

        // OpenGL pass
        // *** Model-view transform
        glMatrixMode(GL_MODELVIEW);
        glLoadIdentity();
        glTranslatef(0.0, 0.0, -3.0);
        glRotatef(0.0, 1.0, 0.0, 0.0);
        glRotatef(0.0, 0.0, 1.0, 0.0);

        // *** Render VBO as red points
        glBindBuffer(GL_ARRAY_BUFFER, vbo);
        glVertexPointer(4, GL_FLOAT, 0, 0);
        glEnableClientState(GL_VERTEX_ARRAY);
        glColor3f(1.0, 0.0, 0.0);
        glDrawArrays(GL_POINTS, 0, mesh_width * mesh_height);
        glDisableClientState(GL_VERTEX_ARRAY);

        // Swap buffers
        OpenGLWin.Display();
    }

    // Cleanup: unregister from CUDA before deleting the GL buffer.
    cudaGraphicsUnregisterResource(cuda_vbo_resource);
    // FIX: the original called glBindBuffer(1, vbo) — '1' is not a valid
    // buffer-binding target enum; GL_ARRAY_BUFFER was intended.
    glBindBuffer(GL_ARRAY_BUFFER, vbo);
    glDeleteBuffers(1, &vbo);

    // Close device
    cutilDeviceReset();
    return 0;
}