// Main program int main(int argc, char** argv) { // Create the CUTIL timer cutilCheckError( cutCreateTimer( &timer)); if (CUTFalse == initGL(argc, argv)) { return CUTFalse; } initCuda(argc, argv); CUT_CHECK_ERROR_GL(); // register callbacks glutDisplayFunc(fpsDisplay); glutKeyboardFunc(keyboard); glutMouseFunc(mouse); glutMotionFunc(motion); // start rendering mainloop glutMainLoop(); // clean up cudaThreadExit(); cutilExit(argc, argv); }
// keplereq_wrapper_C: // C wrapper function to solve's Kepler's equation num times. // inputs: // ph_ma: pointer to beginning element of array of doubles containing mean anomaly in radians // ph_ecc: pointer to beginning element of array of doubles containing eccentricity // num: integer size of input arrays // ph_eccanom: pointer to beginning element of array of doubles eccentric anomaly in radians // outputs: // ph_eccanom: values overwritten with eccentric anomaly // assumptions: // input mean anomalies between 0 and 2pi // input eccentricities between 0 and 1 // all three arrays have at least num elements // void keplereq_wrapper_c(double *ph_ma, double *ph_ecc, int num, double *ph_eccanom) { int gpuid = init_cuda(); // put vectors in thrust format from raw points thrust::host_vector<double> h_ecc(ph_ecc,ph_ecc+num); thrust::host_vector<double> h_ma(ph_ma,ph_ma+num); cutCreateTimer(&memoryTime); cutCreateTimer(&kernelTime); cutResetTimer(memoryTime); cutResetTimer(kernelTime); if(gpuid>=0) { cutStartTimer(memoryTime); // transfer input params to GPU thrust::device_vector<double> d_ecc = h_ecc; thrust::device_vector<double> d_ma = h_ma; // allocate mem on GPU thrust::device_vector<double> d_eccanom(num); cudaThreadSynchronize(); cutStopTimer(memoryTime); // distribute the computation to the GPU cutStartTimer(kernelTime); thrust::for_each( thrust::make_zip_iterator(thrust::make_tuple(d_ma.begin(),d_ecc.begin(),d_eccanom.begin())), thrust::make_zip_iterator(thrust::make_tuple(d_ma.end(), d_ecc.end(), d_eccanom.end())), keplereq_functor() ); cudaThreadSynchronize(); cutStopTimer(kernelTime); // transfer results back to host cutStartTimer(memoryTime); thrust::copy(d_eccanom.begin(),d_eccanom.end(),ph_eccanom); cudaThreadSynchronize(); cutStopTimer(memoryTime); } else { // distribute the computation to the CPU cutStartTimer(kernelTime); thrust::for_each( thrust::make_zip_iterator(thrust::make_tuple(h_ma.begin(),h_ecc.begin(),ph_eccanom)), thrust::make_zip_iterator(thrust::make_tuple(h_ma.end(), h_ecc.end(), ph_eccanom+num)), keplereq_functor() ); cutStopTimer(kernelTime); } }
//////////////////////////////////////////////////////////////////////////////// //! Run a simple test for CUDA //////////////////////////////////////////////////////////////////////////////// CUTBoolean runTest(int argc, char** argv) { if (!cutCheckCmdLineFlag(argc, (const char **)argv, "noqatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { g_bQAReadback = true; fpsLimit = frameCheckNumber; } // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. if (CUTFalse == initGL(argc, argv)) { return CUTFalse; } // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) cutilGLDeviceInit(argc, argv); else { cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); } // Create the CUTIL timer cutilCheckError( cutCreateTimer( &timer)); // register callbacks glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutMouseFunc(mouse); glutMotionFunc(motion); if (g_bQAReadback) { g_CheckRender = new CheckBackBuffer(window_width, window_height, 4); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } // create VBO createVBO(&vbo); // run the cuda part runCuda(vbo); // check result of Cuda step checkResultCuda(argc, argv, vbo); atexit(cleanup); // start rendering mainloop glutMainLoop(); cudaThreadExit(); return CUTTrue; }
void initCuda() { // allocate device memory cutilSafeCall( cudaMalloc( (void**) &d_img, (width * height * sizeof(unsigned int)) )); cutilSafeCall( cudaMalloc( (void**) &d_temp, (width * height * sizeof(unsigned int)) )); // Refer to boxFilter_kernel.cu for implementation initTexture(width, height, h_img); cutilCheckError( cutCreateTimer( &timer)); }
void init(int numParticles, uint3 gridSize) { psystem = new ParticleSystem(numParticles, gridSize); psystem->reset(ParticleSystem::CONFIG_GRID); renderer = new ParticleRenderer; renderer->setParticleRadius(psystem->getParticleRadius()); renderer->setColorBuffer(psystem->getColorBuffer()); cutilCheckError(cutCreateTimer(&timer)); }
void _init(int numBodies, int numDevices, int p, int q, bool bUsePBO, bool useHostMem, bool useCpu) { if (useCpu) { m_nbodyCpu = new BodySystemCPU<T>(numBodies); m_nbody = m_nbodyCpu; m_nbodyCuda = 0; } else { m_nbodyCuda = new BodySystemCUDA<T>(numBodies, numDevices, p, q, bUsePBO, useHostMem); m_nbody = m_nbodyCuda; m_nbodyCpu = 0; } // allocate host memory m_hPos = new T[numBodies*4]; m_hVel = new T[numBodies*4]; m_hColor = new float[numBodies*4]; m_nbody->setSoftening(activeParams.m_softening); m_nbody->setDamping(activeParams.m_damping); if (useCpu) { cutilCheckError(cutCreateTimer(&timer)); cutilCheckError(cutStartTimer(timer)); } else { cutilSafeCall( cudaEventCreate(&startEvent) ); cutilSafeCall( cudaEventCreate(&stopEvent) ); cutilSafeCall( cudaEventCreate(&hostMemSyncEvent) ); } if (!benchmark && !compareToCPU) { m_renderer = new ParticleRenderer; _resetRenderer(); } cutilCheckError(cutCreateTimer(&demoTimer)); cutilCheckError(cutStartTimer(demoTimer)); }
void initCudaBuffers() { unsigned int size = width * height * sizeof(unsigned int); // allocate device memory cutilSafeCall( cudaMalloc( (void**) &d_img, size)); cutilSafeCall( cudaMalloc( (void**) &d_temp, size)); cutilSafeCall( cudaMemcpy( d_img, h_img, size, cudaMemcpyHostToDevice)); cutilCheckError( cutCreateTimer( &timer)); }
void initCuda() { // allocate device memory cutilSafeCall( cudaMalloc( (void**) &d_img, (width * height * sizeof(unsigned int)) )); //initialize gaussian mask //updateGaussianGold(gaussian_delta, filter_radius); updateGaussian(gaussian_delta, filter_radius); initTexture(width, height, h_img); cutilCheckError( cutCreateTimer( &timer)); }
// initialize particle system void initParticles(int numParticles, bool bUseVBO, bool bUseGL) { psystem = new ParticleSystem(numParticles, bUseVBO, bUseGL); psystem->reset(ParticleSystem::CONFIG_RANDOM); if (bUseVBO) { renderer = new SmokeRenderer(numParticles); renderer->setLightTarget(vec3f(0.0, 1.0, 0.0)); cutilCheckError(cutCreateTimer(&timer)); } }
void runGraphicsTest(int argc, char** argv) { printf("MarchingCubes "); if (g_bFBODisplay) printf("[w/ FBO] "); if (g_bOpenGLQA) printf("[Readback Comparisons] "); printf("\n"); if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) { printf("[%s]\n", argv[0]); printf(" Does not explicitly support -device=n in OpenGL mode\n"); printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); printf(" > %s -device=n -qatest\n", argv[0]); printf("exiting...\n"); exit(0); } // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. if(CUTFalse == initGL(&argc, argv)) { return; } cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); // register callbacks glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutMouseFunc(mouse); glutMotionFunc(motion); glutIdleFunc(idle); glutReshapeFunc(reshape); initMenus(); // Initialize CUDA buffers for Marching Cubes initMC(argc, argv); cutilCheckError( cutCreateTimer( &timer)); if (g_bOpenGLQA) { g_CheckRender = new CheckBackBuffer(window_width, window_height, 4); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } // start rendering mainloop glutMainLoop(); cudaThreadExit(); }
bool initCUDA(void) { #if __DEVICE_EMULATION__ return true; #else int count = 0; int i = 0; cudaGetDeviceCount(&count); if(count == 0) { fprintf(stderr, "Nu exista nici un device.\n"); return false; } printf("Exista %d device-uri.\n",count); for(i = 0; i < count; i++) { cudaDeviceProp prop; if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) { if(prop.major >= 1) { break; } } if(!prop.canMapHostMemory) { fprintf(stderr, "Device %d cannot map host memory!\n",0); exit(EXIT_FAILURE); } } if(i == count) { fprintf(stderr, "Nu exista nici un device care suporta CUDA.\n"); return false; } cudaSetDevice(cutGetMaxGflopsDeviceId()); cudaSetDeviceFlags(cudaDeviceMapHost); checkCUDAError("cudaSetDeviceFlags"); printf("CUDA initializat cu succes\n"); // Create the CUTIL timer cutilCheckError( cutCreateTimer( &timer)); return true; #endif }
void _runBenchmark(int iterations) { // once without timing to prime the device if (!useCpu) m_nbody->update(activeParams.m_timestep); if (useCpu) { cutCreateTimer(&timer); cutStartTimer(timer); } else { cutilSafeCall(cudaEventRecord(startEvent, 0)); } for (int i = 0; i < iterations; ++i) { m_nbody->update(activeParams.m_timestep); } float milliseconds = 0; if (useCpu) { cutStopTimer(timer); milliseconds = cutGetTimerValue(timer); cutDeleteTimer(timer); } else { cutilSafeCall(cudaEventRecord(stopEvent, 0)); cutilSafeCall(cudaEventSynchronize(stopEvent)); cutilSafeCall( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent)); } double interactionsPerSecond = 0; double gflops = 0; computePerfStats(interactionsPerSecond, gflops, milliseconds, iterations); shrLog("%d bodies, total time for %d iterations: %.3f ms\n", numBodies, iterations, milliseconds); shrLog("= %.3f billion interactions per second\n", interactionsPerSecond); shrLog("= %.3f %s-precision GFLOP/s at %d flops per interaction\n", gflops, (sizeof(T) > 4) ? "double" : "single", flopsPerInteraction); }
//////////////////////////////////////////////////////////////////////////////// //! Run test //////////////////////////////////////////////////////////////////////////////// void runAutoTest(int argc, char** argv) { printf("[%s]\n", sSDKsample); // Cuda init int dev = cutilChooseCudaDevice(argc, argv); cudaDeviceProp deviceProp; cutilSafeCall(cudaGetDeviceProperties(&deviceProp, dev)); printf("Compute capability %d.%d\n", deviceProp.major, deviceProp.minor); int version = deviceProp.major*10 + deviceProp.minor; g_hasDouble = (version >= 13); if (inEmulationMode()) { // workaround since SM13 kernel doesn't produce correct output in emulation mode g_hasDouble = false; } // create FFT plan CUFFT_SAFE_CALL(cufftPlan2d(&fftPlan, meshW, meshH, CUFFT_C2R) ); // allocate memory fftInputW = (meshW / 2)+1; fftInputH = meshH; fftInputSize = (fftInputW*fftInputH)*sizeof(float2); cutilSafeCall(cudaMalloc((void **)&d_h0, fftInputSize) ); cutilSafeCall(cudaMalloc((void **)&d_ht, fftInputSize) ); h_h0 = (float2 *) malloc(fftInputSize); generate_h0(); cutilSafeCall(cudaMemcpy(d_h0, h_h0, fftInputSize, cudaMemcpyHostToDevice) ); cutilSafeCall(cudaMalloc((void **)&d_slope, meshW*meshH*sizeof(float2)) ); cutCreateTimer(&timer); cutStartTimer(timer); prevTime = cutGetTimerValue(timer); // Creating the Auto-Validation Code g_CheckRender = new CheckBackBuffer(windowH, windowH, 4, false); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); runCudaTest(g_hasDouble); cudaThreadExit(); }
void init(int numBodies, int p, int q, bool bUsePBO) { nbodyCUDA = new BodySystemCUDA(numBodies, p, q, bUsePBO); nbody = nbodyCUDA; // allocate host memory hPos = new float[numBodies*4]; hVel = new float[numBodies*4]; hColor = new float[numBodies*4]; nbody->setSoftening(activeParams.m_softening); nbody->setDamping(activeParams.m_damping); //cutilCheckError(cutCreateTimer(&timer)); cutilSafeCall( cudaEventCreate(&startEvent) ); cutilSafeCall( cudaEventCreate(&stopEvent) ); cutilCheckError(cutCreateTimer(&demoTimer)); cutilCheckError(cutStartTimer(demoTimer)); }
int main(int argc, char** argv){ const char *image_path = IMAGE; //TODO : declare ALL variables here LoadBMPFile(&h_Src, &imageW, &imageH, image_path); D = (float *)malloc(imageW*imageH*sizeof(float)); //printf("Input Image\n"); for(r=0;r<imageH;r++){ for(c=0;c<imageW;c++){ D[r*imageW+c] = h_Src[r*imageW+c].x; /*printf("%3.0f ", D[r*imageW+c]);*/ } //printf("\n"); } N = imageW*imageH; for(i=0;i<N;i++){ D[i] = EPSILON - abs(D[i] - THRESHOLD); } //printf("Speed Function\n"); //for(int r=0;r<imageH;r++){ // for(int c=0;c<imageW;c++){ // printf("%3.0f ", D[r*imageW+c]); // } // printf("\n"); //} // Set up CUDA Timer cutCreateTimer(&Timer); cutCreateTimer(&ReInitTimer); cutStartTimer(Timer); init_phi(); if((contour=(float *)malloc(imageW*imageH*sizeof(float)))==NULL)printf("Contour\n"); if((phi1=(float *)malloc(imageW*imageH*sizeof(float)))==NULL)printf("GRADPHI\n"); //update_phi(); // GL initialisation glutInit(&argc, argv); glutInitDisplayMode(GLUT_ALPHA | GLUT_DOUBLE); glutInitWindowSize(imageW,imageH); glutInitWindowPosition(100,100); glutCreateWindow("GL Level Set Evolution"); glClearColor(0.0,0.0,0.0,0.0); glutDisplayFunc(disp); glutMainLoop(); //printf("phi+1\n"); //for(int r=0;r<imageH;r++){ // for(int c=0;c<imageW;c++){ // printf("%6.3f ", phi[r*imageW+c]); // } // printf("\n"); //} }
//////////////////////////////////////////////////////////////////////////////// //! Run test //////////////////////////////////////////////////////////////////////////////// void runGraphicsTest(int argc, char** argv) { printf("[%s] ", sSDKsample); if (g_bOpenGLQA) printf("[OpenGL Readback Comparisons] "); printf("\n"); if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device") ) { printf("[%s]\n", argv[0]); printf(" Does not explicitly support -device=n in OpenGL mode\n"); printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); printf(" > %s -device=n -qatest\n", argv[0]); printf("exiting...\n"); exit(0); } // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. if(CUTFalse == initGL( &argc, argv )) { cudaThreadExit(); return; } cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); // create FFT plan CUFFT_SAFE_CALL(cufftPlan2d(&fftPlan, meshW, meshH, CUFFT_C2R) ); // allocate memory fftInputW = (meshW / 2)+1; fftInputH = meshH; fftInputSize = (fftInputW*fftInputH)*sizeof(float2); cutilSafeCall(cudaMalloc((void **)&d_h0, fftInputSize) ); cutilSafeCall(cudaMalloc((void **)&d_ht, fftInputSize) ); h_h0 = (float2 *) malloc(fftInputSize); generate_h0(); cutilSafeCall(cudaMemcpy(d_h0, h_h0, fftInputSize, cudaMemcpyHostToDevice) ); cutilSafeCall(cudaMalloc((void **)&d_slope, meshW*meshH*sizeof(float2)) ); cutCreateTimer(&timer); cutStartTimer(timer); prevTime = cutGetTimerValue(timer); // create vertex buffers and register with CUDA createVBO(&heightVertexBuffer, meshW*meshH*sizeof(float)); // DEPRECATED: cutilSafeCall(cudaGLRegisterBufferObject(heightVertexBuffer)); cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_heightVB_resource, heightVertexBuffer, cudaGraphicsMapFlagsWriteDiscard)); createVBO(&slopeVertexBuffer, meshW*meshH*sizeof(float2)); // DEPRECATED: cutilSafeCall(cudaGLRegisterBufferObject(slopeVertexBuffer)); cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_slopeVB_resource, slopeVertexBuffer, cudaGraphicsMapFlagsWriteDiscard)); // create vertex and index buffer for mesh createMeshPositionVBO(&posVertexBuffer, meshW, meshH); createMeshIndexBuffer(&indexBuffer, meshW, meshH); // Creating the Auto-Validation Code if (g_bOpenGLQA) { g_CheckRender = new CheckBackBuffer(windowH, windowH, 4); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } runCuda(); // register callbacks glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutMouseFunc(mouse); glutMotionFunc(motion); glutReshapeFunc(reshape); glutIdleFunc(idle); // start rendering mainloop glutMainLoop(); cudaThreadExit(); }
int main(int argc, char **argv) { uchar *h_Data; uint *h_HistogramCPU, *h_HistogramGPU; uchar *d_Data; uint *d_Histogram; uint hTimer; int PassFailFlag = 1; uint byteCount = 64 * 1048576; uint uiSizeMult = 1; cudaDeviceProp deviceProp; deviceProp.major = 0; deviceProp.minor = 0; int dev; shrQAStart(argc, argv); // set logfile name and start logs shrSetLogFileName ("histogram.txt"); //Use command-line specified CUDA device, otherwise use device with highest Gflops/s if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") ) { dev = cutilDeviceInit(argc, argv); if (dev < 0) { printf("No CUDA Capable Devices found, exiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } } else { cudaSetDevice( dev = cutGetMaxGflopsDeviceId() ); cutilSafeCall( cudaChooseDevice(&dev, &deviceProp) ); } cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev) ); printf("CUDA device [%s] has %d Multi-Processors, Compute %d.%d\n", deviceProp.name, deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor); int version = deviceProp.major * 0x10 + deviceProp.minor; if(version < 0x11) { printf("There is no device supporting a minimum of CUDA compute capability 1.1 for this SDK sample\n"); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } cutilCheckError(cutCreateTimer(&hTimer)); // Optional Command-line multiplier to increase size of array to histogram if (shrGetCmdLineArgumentu(argc, (const char**)argv, "sizemult", &uiSizeMult)) { uiSizeMult = CLAMP(uiSizeMult, 1, 10); byteCount *= uiSizeMult; } shrLog("Initializing data...\n"); shrLog("...allocating CPU memory.\n"); h_Data = (uchar *)malloc(byteCount); h_HistogramCPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint)); h_HistogramGPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint)); shrLog("...generating input data\n"); srand(2009); for(uint i = 0; i < byteCount; i++) h_Data[i] = rand() % 256; shrLog("...allocating GPU memory and copying input data\n\n"); cutilSafeCall( cudaMalloc((void **)&d_Data, byteCount ) ); cutilSafeCall( cudaMalloc((void **)&d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint) ) ); cutilSafeCall( cudaMemcpy(d_Data, h_Data, byteCount, cudaMemcpyHostToDevice) ); { shrLog("Starting up 64-bin histogram...\n\n"); initHistogram64(); shrLog("Running 64-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns); for(int iter = -1; iter < numRuns; iter++){ //iter == -1 -- warmup iteration if(iter == 0){ cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } histogram64(d_Histogram, d_Data, byteCount); } cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutStopTimer(hTimer)); double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns; shrLog("histogram64() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs); shrLogEx(LOGBOTH | MASTER, 0, "histogram64, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM64_THREADBLOCK_SIZE); shrLog("\nValidating GPU results...\n"); shrLog(" ...reading back GPU results\n"); cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM64_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) ); shrLog(" ...histogram64CPU()\n"); histogram64CPU( h_HistogramCPU, h_Data, byteCount ); shrLog(" ...comparing the results...\n"); for(uint i = 0; i < HISTOGRAM64_BIN_COUNT; i++) if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0; shrLog(PassFailFlag ? " ...64-bin histograms match\n\n" : " ***64-bin histograms do not match!!!***\n\n" ); shrLog("Shutting down 64-bin histogram...\n\n\n"); closeHistogram64(); } { shrLog("Initializing 256-bin histogram...\n"); initHistogram256(); shrLog("Running 256-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns); for(int iter = -1; iter < numRuns; iter++){ //iter == -1 -- warmup iteration if(iter == 0){ cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } histogram256(d_Histogram, d_Data, byteCount); } cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutStopTimer(hTimer)); double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns; shrLog("histogram256() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs); shrLogEx(LOGBOTH | MASTER, 0, "histogram256, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM256_THREADBLOCK_SIZE); shrLog("\nValidating GPU results...\n"); shrLog(" ...reading back GPU results\n"); cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) ); shrLog(" ...histogram256CPU()\n"); histogram256CPU( h_HistogramCPU, h_Data, byteCount ); shrLog(" ...comparing the results\n"); for(uint i = 0; i < HISTOGRAM256_BIN_COUNT; i++) if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0; shrLog(PassFailFlag ? " ...256-bin histograms match\n\n" : " ***256-bin histograms do not match!!!***\n\n" ); shrLog("Shutting down 256-bin histogram...\n\n\n"); closeHistogram256(); } shrLog("Shutting down...\n"); cutilCheckError(cutDeleteTimer(hTimer)); cutilSafeCall( cudaFree(d_Histogram) ); cutilSafeCall( cudaFree(d_Data) ); free(h_HistogramGPU); free(h_HistogramCPU); free(h_Data); cutilDeviceReset(); shrLog("%s - Test Summary\n", sSDKsample); // pass or fail (for both 64 bit and 256 bit histograms) shrQAFinishExit(argc, (const char **)argv, (PassFailFlag ? QA_PASSED : QA_FAILED)); }
int main(int argc, char** argv) { printf("[%s]\n", sSDKsample); if (argc > 1) { if (cutCheckCmdLineFlag(argc, (const char **)argv, "help")) { printHelp(); } if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { g_bQAReadback = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bOpenGLQA = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "fbo")) { g_bFBODisplay = true; fpsLimit = frameCheckNumber; } } if (g_bQAReadback) { runAutoTest(argc, argv); } else { // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL( &argc, argv ); // use command-line specified CUDA device if possible, otherwise search for capable device if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilGLDeviceInit(argc, argv); int device; cudaGetDevice( &device ); if( checkCUDAProfile( device ) == false ) { cudaThreadExit(); cutilExit(argc, argv); } } else { //cudaGLSetGLDevice (cutGetMaxGflopsDeviceId() ); int dev = findCapableDevice(argc, argv); if( dev != -1 ) cudaGLSetGLDevice( dev ); else { cudaThreadExit(); cutilExit(argc, argv); } } cutilCheckError(cutCreateTimer(&timer)); cutilCheckError(cutResetTimer(timer)); glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutReshapeFunc(reshape); glutIdleFunc(idle); if (g_bOpenGLQA) { loadDefaultImage( argc, argv ); } if (argc > 1) { char *filename; if (cutGetCmdLineArgumentstr(argc, (const char **)argv, "file", &filename)) { initializeData(filename, argc, argv); } } else { loadDefaultImage( argc, argv ); } // If code is not printing the USage, then we execute this path. if (!bQuit) { if (g_bOpenGLQA) { g_CheckRender = new CheckBackBuffer(wWidth, wHeight, 4); g_CheckRender->setPixelFormat(GL_BGRA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } printf("I: display image\n"); printf("T: display Sobel edge detection (computed with tex)\n"); printf("S: display Sobel edge detection (computed with tex+shared memory)\n"); printf("Use the '-' and '=' keys to change the brightness.\n"); printf("b: switch block filter operation (mean/Sobel)\n"); printf("p: swtich point filter operation (threshold on/off)\n"); fflush(stdout); atexit(cleanup); glutMainLoop(); } } cudaThreadExit(); cutilExit(argc, argv); }
int main(int argc, char **argv) { GpuProfiling::initProf(); // Start logs shrSetLogFileName ("scan.txt"); shrLog("%s Starting...\n\n", argv[0]); //Use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) cutilDeviceInit(argc, argv); else cudaSetDevice( cutGetMaxGflopsDeviceId() ); uint *d_Input, *d_Output; uint *h_Input, *h_OutputCPU, *h_OutputGPU; uint hTimer; const uint N = 13 * 1048576 / 2; shrLog("Allocating and initializing host arrays...\n"); cutCreateTimer(&hTimer); h_Input = (uint *)malloc(N * sizeof(uint)); h_OutputCPU = (uint *)malloc(N * sizeof(uint)); h_OutputGPU = (uint *)malloc(N * sizeof(uint)); srand(2009); for(uint i = 0; i < N; i++) h_Input[i] = rand(); shrLog("Allocating and initializing CUDA arrays...\n"); cutilSafeCall( cudaMalloc((void **)&d_Input, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_Output, N * sizeof(uint)) ); cutilSafeCall( cudaMemcpy(d_Input, h_Input, N * sizeof(uint), cudaMemcpyHostToDevice) ); shrLog("Initializing CUDA-C scan...\n\n"); initScan(); int globalFlag = 1; size_t szWorkgroup; const int iCycles = 100; shrLog("*** Running GPU scan for short arrays (%d identical iterations)...\n\n", iCycles); for(uint arrayLength = MIN_SHORT_ARRAY_SIZE; arrayLength <= MAX_SHORT_ARRAY_SIZE; arrayLength <<= 1){ shrLog("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength); cutilSafeCall( cudaThreadSynchronize() ); cutResetTimer(hTimer); cutStartTimer(hTimer); for(int i = 0; i < iCycles; i++) { szWorkgroup = scanExclusiveShort(d_Output, d_Input, N / arrayLength, arrayLength); } cutilSafeCall( cudaThreadSynchronize()); cutStopTimer(hTimer); double timerValue = 1.0e-3 * cutGetTimerValue(hTimer) / iCycles; shrLog("Validating the results...\n"); shrLog("...reading back GPU results\n"); cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost) ); shrLog(" ...scanExclusiveHost()\n"); scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength); // Compare GPU results with CPU results and accumulate error for this test shrLog(" ...comparing the results\n"); int localFlag = 1; for(uint i = 0; i < N; i++) { if(h_OutputCPU[i] != h_OutputGPU[i]) { localFlag = 0; break; } } // Log message on individual test result, then accumulate to global flag shrLog(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!"); globalFlag = globalFlag && localFlag; // Data log if (arrayLength == MAX_SHORT_ARRAY_SIZE) { shrLog("\n"); shrLogEx(LOGBOTH | MASTER, 0, "scan-Short, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup); shrLog("\n"); } } shrLog("***Running GPU scan for large arrays (%u identical iterations)...\n\n", iCycles); for(uint arrayLength = MIN_LARGE_ARRAY_SIZE; arrayLength <= MAX_LARGE_ARRAY_SIZE; arrayLength <<= 1){ shrLog("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength); cutilSafeCall( cudaThreadSynchronize() ); cutResetTimer(hTimer); cutStartTimer(hTimer); for(int i = 0; i < iCycles; i++) { szWorkgroup = scanExclusiveLarge(d_Output, d_Input, N / arrayLength, arrayLength); } cutilSafeCall( cudaThreadSynchronize() ); cutStopTimer(hTimer); double timerValue = 1.0e-3 * cutGetTimerValue(hTimer) / iCycles; shrLog("Validating the results...\n"); shrLog("...reading back GPU results\n"); cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost) ); shrLog("...scanExclusiveHost()\n"); scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength); // Compare GPU results with CPU results and accumulate error for this test shrLog(" ...comparing the results\n"); int localFlag = 1; for(uint i = 0; i < N; i++) { if(h_OutputCPU[i] != h_OutputGPU[i]) { localFlag = 0; break; } } // Log message on individual test result, then accumulate to global flag shrLog(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!"); globalFlag = globalFlag && localFlag; // Data log if (arrayLength == MAX_LARGE_ARRAY_SIZE) { shrLog("\n"); shrLogEx(LOGBOTH | MASTER, 0, "scan-Large, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup); shrLog("\n"); } } // pass or fail (cumulative... all tests in the loop) shrLog(globalFlag ? "PASSED\n\n" : "FAILED\n\n"); GpuProfiling::printResults(); shrLog("Shutting down...\n"); closeScan(); cutilSafeCall( cudaFree(d_Output)); cutilSafeCall( cudaFree(d_Input)); cutilCheckError( cutDeleteTimer(hTimer) ); cudaThreadExit(); exit(0); shrEXIT(argc, (const char**)argv); }
void quickshift(image_t im, float sigma, float tau, float * map, float * gaps, float * E) { int verb = 1 ; float *M = 0, *n = 0; float tau2; int K, d; int N1,N2, i1,i2, j1,j2, R, tR; int medoid = 0 ; float const * I = im.I; N1 = im.N1; N2 = im.N2; K = im.K; d = 2 + K ; /* Total dimensions include spatial component (x,y) */ tau2 = tau*tau; if (medoid) { /* n and M are only used in mediod shift */ M = (float *) calloc(N1*N2*d, sizeof(float)) ; n = (float *) calloc(N1*N2, sizeof(float)) ; } R = (int) ceil (3 * sigma) ; tR = (int) ceil (tau) ; if (verb) { printf("quickshift: [N1,N2,K]: [%d,%d,%d]\n", N1,N2,K) ; printf("quickshift: type: %s\n", medoid ? "medoid" : "quick"); printf("quickshift: sigma: %g\n", sigma) ; /* R is ceil(3 * sigma) and determines the window size to accumulate * similarity */ printf("quickshift: R: %d\n", R) ; printf("quickshift: tau: %g\n", tau) ; printf("quickshift: tR: %d\n", tR) ; } /* ----------------------------------------------------------------- * n * -------------------------------------------------------------- */ /* If we are doing medoid shift, initialize n to the inner product of the * image with itself */ if (n) { for (i2 = 0 ; i2 < N2 ; ++ i2) { for (i1 = 0 ; i1 < N1 ; ++ i1) { n [i1 + N1 * i2] = inner(I,N1,N2,K, i1,i2, i1,i2) ; } } } unsigned int Etimer; cutilCheckError( cutCreateTimer(&Etimer) ); cutilCheckError( cutResetTimer(Etimer) ); cutilCheckError( cutStartTimer(Etimer) ); /* ----------------------------------------------------------------- * E = - [oN'*F]', M * -------------------------------------------------------------- */ /* D_ij = d(x_i,x_j) E_ij = exp(- .5 * D_ij / sigma^2) ; F_ij = - E_ij E_i = sum_j E_ij M_di = sum_j X_j F_ij E is the parzen window estimate of the density 0 = dissimilar to everything, windowsize = identical */ for (i2 = 0 ; i2 < N2 ; ++ i2) { for (i1 = 0 ; i1 < N1 ; ++ i1) { float Ei = 0; int j1min = VL_MAX(i1 - R, 0 ) ; int j1max = VL_MIN(i1 + R, N1-1) ; int j2min = VL_MAX(i2 - R, 0 ) ; int j2max = VL_MIN(i2 + R, N2-1) ; /* For each pixel in the window compute the distance between it and the * source pixel */ for (j2 = j2min ; j2 <= j2max ; ++ j2) { for (j1 = j1min ; j1 <= j1max ; ++ j1) { float Dij = distance(I,N1,N2,K, i1,i2, j1,j2) ; /* Make distance a similarity */ float Fij = exp(- Dij / (2*sigma*sigma)) ; /* E is E_i above */ Ei += Fij; if (M) { /* Accumulate votes for the median */ int k ; M [i1 + N1*i2 + (N1*N2) * 0] += j1 * Fij ; M [i1 + N1*i2 + (N1*N2) * 1] += j2 * Fij ; for (k = 0 ; k < K ; ++k) { M [i1 + N1*i2 + (N1*N2) * (k+2)] += I [j1 + N1*j2 + (N1*N2) * k] * Fij ; } } } /* j1 */ } /* j2 */ /* Normalize */ E [i1 + N1 * i2] = Ei / ((j1max-j1min)*(j2max-j2min)); /*E [i1 + N1 * i2] = Ei ; */ } /* i1 */ } /* i2 */ cutilCheckError( cutStopTimer(Etimer) ); float ETime = cutGetTimerValue(Etimer); printf("ComputeE: %fms\n", ETime); unsigned int Ntimer; cutilCheckError( cutCreateTimer(&Ntimer) ); cutilCheckError( cutResetTimer(Ntimer) ); cutilCheckError( cutStartTimer(Ntimer) ); /* ----------------------------------------------------------------- * Find best neighbors * -------------------------------------------------------------- */ if (medoid) { /* Qij = - nj Ei - 2 sum_k Gjk Mik n is I.^2 */ /* medoid shift */ for (i2 = 0 ; i2 < N2 ; ++i2) { for (i1 = 0 ; i1 < N1 ; ++i1) { float sc_best = 0 ; /* j1/j2 best are the best indicies for each i */ float j1_best = i1 ; float j2_best = i2 ; int j1min = VL_MAX(i1 - R, 0 ) ; int j1max = VL_MIN(i1 + R, N1-1) ; int j2min = VL_MAX(i2 - R, 0 ) ; int j2max = VL_MIN(i2 + R, N2-1) ; for (j2 = j2min ; j2 <= j2max ; ++ j2) { for (j1 = j1min ; j1 <= j1max ; ++ j1) { float Qij = - n [j1 + j2 * N1] * E [i1 + i2 * N1] ; int k ; Qij -= 2 * j1 * M [i1 + i2 * N1 + (N1*N2) * 0] ; Qij -= 2 * j2 * M [i1 + i2 * N1 + (N1*N2) * 1] ; for (k = 0 ; k < K ; ++k) { Qij -= 2 * I [j1 + j2 * N1 + (N1*N2) * k] * M [i1 + i2 * N1 + (N1*N2) * (k + 2)] ; } if (Qij > sc_best) { sc_best = Qij ; j1_best = j1 ; j2_best = j2 ; } } } /* map_i is the linear index of j which is the best pair (in matlab * notation * gaps_i is the score of the best match */ map [i1 + N1 * i2] = j1_best + N1 * j2_best ; /*+ 1 ; */ gaps[i1 + N1 * i2] = sc_best ; } } } else { /* Quickshift assigns each i to the closest j which has an increase in the * density (E). If there is no j s.t. Ej > Ei, then gaps_i == inf (a root * node in one of the trees of merges). */ for (i2 = 0 ; i2 < N2 ; ++i2) { for (i1 = 0 ; i1 < N1 ; ++i1) { float E0 = E [i1 + N1 * i2] ; float d_best = INF ; float j1_best = i1 ; float j2_best = i2 ; int j1min = VL_MAX(i1 - tR, 0 ) ; int j1max = VL_MIN(i1 + tR, N1-1) ; int j2min = VL_MAX(i2 - tR, 0 ) ; int j2max = VL_MIN(i2 + tR, N2-1) ; for (j2 = j2min ; j2 <= j2max ; ++ j2) { for (j1 = j1min ; j1 <= j1max ; ++ j1) { if (E [j1 + N1 * j2] > E0) { float Dij = distance(I,N1,N2,K, i1,i2, j1,j2) ; if (Dij <= tau2 && Dij < d_best) { d_best = Dij ; j1_best = j1 ; j2_best = j2 ; } } } } /* map is the index of the best pair */ /* gaps_i is the minimal distance, inf implies no Ej > Ei within * distance tau from the point */ map [i1 + N1 * i2] = j1_best + N1 * j2_best ; /* + 1 ; */ if (map[i1 + N1 * i2] != i1 + N1 * i2) gaps[i1 + N1 * i2] = sqrt(d_best) ; else gaps[i1 + N1 * i2] = d_best; /* inf */ } } } if (M) free(M) ; if (n) free(n) ; cutilCheckError( cutStopTimer(Ntimer) ); float NTime = cutGetTimerValue(Ntimer); printf("ComputeN: %fms\n", NTime); }
//////////////////////////////////////////////////////////////////////////////// // Test driver //////////////////////////////////////////////////////////////////////////////// int main(int argc, char** argv){ uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal; uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal; uint hTimer; const uint N = 4 * 1048576; const uint DIR = 1; const uint numValues = 65536; printf("Allocating and initializing host arrays...\n\n"); cutCreateTimer(&hTimer); h_SrcKey = (uint *)malloc(N * sizeof(uint)); h_SrcVal = (uint *)malloc(N * sizeof(uint)); h_DstKey = (uint *)malloc(N * sizeof(uint)); h_DstVal = (uint *)malloc(N * sizeof(uint)); srand(2009); for(uint i = 0; i < N; i++) h_SrcKey[i] = rand() % numValues; fillValues(h_SrcVal, N); printf("Allocating and initializing CUDA arrays...\n\n"); cutilSafeCall( cudaMalloc((void **)&d_DstKey, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_DstVal, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_BufKey, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_BufVal, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)) ); cutilSafeCall( cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice) ); cutilSafeCall( cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice) ); printf("Initializing GPU merge sort...\n"); initMergeSort(); printf("Running GPU merge sort...\n"); cutilSafeCall( cudaThreadSynchronize() ); cutResetTimer(hTimer); cutStartTimer(hTimer); mergeSort( d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, DIR ); cutilSafeCall( cudaThreadSynchronize() ); cutStopTimer(hTimer); printf("Time: %f ms\n", cutGetTimerValue(hTimer)); printf("Reading back GPU merge sort results...\n"); cutilSafeCall( cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost) ); cutilSafeCall( cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost) ); printf("Inspecting the results...\n"); uint keysFlag = validateSortedKeys( h_DstKey, h_SrcKey, 1, N, numValues, DIR ); uint valuesFlag = validateSortedValues( h_DstKey, h_DstVal, h_SrcKey, 1, N ); printf( (keysFlag && valuesFlag) ? "TEST PASSED\n" : "TEST FAILED\n"); printf("Shutting down...\n"); closeMergeSort(); cutilCheckError( cutDeleteTimer(hTimer) ); cutilSafeCall( cudaFree(d_SrcVal) ); cutilSafeCall( cudaFree(d_SrcKey) ); cutilSafeCall( cudaFree(d_BufVal) ); cutilSafeCall( cudaFree(d_BufKey) ); cutilSafeCall( cudaFree(d_DstVal) ); cutilSafeCall( cudaFree(d_DstKey) ); free(h_DstVal); free(h_DstKey); free(h_SrcVal); free(h_SrcKey); cudaThreadExit(); cutilExit(argc, argv); }
int main(int argc, char** argv) { pArgc = &argc; pArgv = argv; shrQAStart(argc, argv); if (argc > 1) { if (cutCheckCmdLineFlag(argc, (const char **)argv, "help")) { printHelp(); } if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { g_bQAReadback = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bOpenGLQA = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "fbo")) { g_bFBODisplay = true; fpsLimit = frameCheckNumber; } } if (g_bQAReadback) { runAutoTest(argc, argv); } else { if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) { printf(" This SDK does not explicitly support -device=n when running with OpenGL.\n"); printf(" When specifying -device=n (n=0,1,2,....) the sample must not use OpenGL.\n"); printf(" See details below to run without OpenGL:\n\n"); printf(" > %s -device=n -qatest\n\n", argv[0]); printf("exiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); } // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL( &argc, argv ); //cudaGLSetGLDevice (cutGetMaxGflopsDeviceId() ); int dev = findCapableDevice(argc, argv); if( dev != -1 ) { cudaGLSetGLDevice( dev ); } else { shrQAFinishExit2(g_bQAReadback, *pArgc, (const char **)pArgv, QA_PASSED); } cutilCheckError(cutCreateTimer(&timer)); cutilCheckError(cutResetTimer(timer)); glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutReshapeFunc(reshape); if (g_bOpenGLQA) { loadDefaultImage( argc, argv ); } if (argc > 1) { char *filename; if (cutGetCmdLineArgumentstr(argc, (const char **)argv, "file", &filename)) { initializeData(filename, argc, argv); } } else { loadDefaultImage( argc, argv ); } // If code is not printing the USage, then we execute this path. if (!bQuit) { if (g_bOpenGLQA) { g_CheckRender = new CheckBackBuffer(wWidth, wHeight, 4); g_CheckRender->setPixelFormat(GL_BGRA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } printf("I: display Image (no filtering)\n"); printf("T: display Sobel Edge Detection (Using Texture)\n"); printf("S: display Sobel Edge Detection (Using SMEM+Texture)\n"); printf("Use the '-' and '=' keys to change the brightness.\n"); printf("b: switch block filter operation (mean/Sobel)\n"); printf("p: switch point filter operation (threshold on/off)\n"); fflush(stdout); atexit(cleanup); glutTimerFunc(REFRESH_DELAY, timerEvent,0); glutMainLoop(); } } cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); }
void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, ReduceType datatype) { fprintf(stderr, "Shmoo wasn't implemented in this modified kernel!\n"); exit(1); // create random input data on CPU unsigned int bytes = maxN * sizeof(T); T *h_idata = (T*) malloc(bytes); for(int i = 0; i < maxN; i++) { // Keep the numbers small so we don't get truncation error in the sum if (datatype == REDUCE_INT) h_idata[i] = (T)(rand() & 0xFF); else h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX; } int maxNumBlocks = MIN( maxN / maxThreads, MAX_BLOCK_DIM_SIZE); // allocate mem for the result on host side T* h_odata = (T*) malloc(maxNumBlocks*sizeof(T)); // allocate device memory and data T* d_idata = NULL; T* d_odata = NULL; cutilSafeCallNoSync( cudaMalloc((void**) &d_idata, bytes) ); cutilSafeCallNoSync( cudaMalloc((void**) &d_odata, maxNumBlocks*sizeof(T)) ); // copy data directly to device memory cutilSafeCallNoSync( cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice) ); cutilSafeCallNoSync( cudaMemcpy(d_odata, h_idata, maxNumBlocks*sizeof(T), cudaMemcpyHostToDevice) ); // warm-up for (int kernel = 0; kernel < 7; kernel++) { sumreduce<T>(maxN, maxThreads, maxNumBlocks, kernel, d_idata, d_odata); } int testIterations = 100; unsigned int timer = 0; cutilCheckError( cutCreateTimer( &timer)); // print headers shrLog("Time in milliseconds for various numbers of elements for each kernel\n\n\n"); shrLog("Kernel"); for (int i = minN; i <= maxN; i *= 2) { shrLog(", %d", i); } for (int kernel = 0; kernel < 7; kernel++) { shrLog("\n%d", kernel); for (int i = minN; i <= maxN; i *= 2) { cutResetTimer(timer); int numBlocks = 0; int numThreads = 0; getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, numThreads); float reduceTime; if( numBlocks <= MAX_BLOCK_DIM_SIZE ) { benchmarkReduceSum(i, numThreads, numBlocks, maxThreads, maxBlocks, kernel, testIterations, false, 1, timer, h_odata, d_idata, d_odata); reduceTime = cutGetAverageTimerValue(timer); } else { reduceTime = -1.0; } shrLog(", %.5f", reduceTime); } } // cleanup cutilCheckError(cutDeleteTimer(timer)); free(h_idata); free(h_odata); cutilSafeCallNoSync(cudaFree(d_idata)); cutilSafeCallNoSync(cudaFree(d_odata)); }
int main(int argc, char **argv) { shrQAStart(argc, argv); if (argc > 1) { if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { g_bQAReadback = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bOpenGLQA = true; g_bFBODisplay = false; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "fbo")) { g_bFBODisplay = true; fpsLimit = frameCheckNumber; } } if (g_bQAReadback) { runAutoTest(argc, argv); } else { printf("[%s] ", sSDKsample); if (g_bFBODisplay) printf("[FBO Display] "); if (g_bOpenGLQA) printf("[OpenGL Readback Comparisons] "); printf("\n"); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) { printf("[%s]\n", argv[0]); printf(" Does not explicitly support -device=n in OpenGL mode\n"); printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); printf(" > %s -device=n -qatest\n", argv[0]); printf("exiting...\n"); exit(0); } // First load the image, so we know what the size of the image (imageW and imageH) printf("Allocating host and CUDA memory and loading image file...\n"); const char *image_path = cutFindFilePath("portrait_noise.bmp", argv[0]); if (image_path == NULL) { printf( "imageDenoisingGL was unable to find and load image file <portrait_noise.bmp>.\nExiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } LoadBMPFile(&h_Src, &imageW, &imageH, image_path); printf("Data init done.\n"); // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL( &argc, argv ); cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); cutilSafeCall( CUDA_MallocArray(&h_Src, imageW, imageH) ); initOpenGLBuffers(); // Creating the Auto-Validation Code if (g_bOpenGLQA) { if (g_bFBODisplay) { g_CheckRender = new CheckFBO(imageW, imageH, 4); } else { g_CheckRender = new CheckBackBuffer(imageW, imageH, 4); } g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(g_bOpenGLQA); } } printf("Starting GLUT main loop...\n"); printf("Press [1] to view noisy image\n"); printf("Press [2] to view image restored with knn filter\n"); printf("Press [3] to view image restored with nlm filter\n"); printf("Press [4] to view image restored with modified nlm filter\n"); printf("Press [ ] to view smooth/edgy areas [RED/BLUE] Ct's\n"); printf("Press [f] to print frame rate\n"); printf("Press [?] to print Noise and Lerp Ct's\n"); printf("Press [q] to exit\n"); glutDisplayFunc(displayFunc); glutKeyboardFunc(shutDown); cutilCheckError( cutCreateTimer(&hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); glutTimerFunc(REFRESH_DELAY, timerEvent,0); glutMainLoop(); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); }
bool runTestMax( int argc, char** argv, ReduceType datatype) { int size = 1<<24; // number of elements to reduce int maxThreads = 256; // number of threads per block int whichKernel = 6; int maxBlocks = 64; bool cpuFinalReduction = false; int cpuFinalThreshold = 1; cutGetCmdLineArgumenti( argc, (const char**) argv, "n", &size); cutGetCmdLineArgumenti( argc, (const char**) argv, "threads", &maxThreads); cutGetCmdLineArgumenti( argc, (const char**) argv, "kernel", &whichKernel); cutGetCmdLineArgumenti( argc, (const char**) argv, "maxblocks", &maxBlocks); shrLog("METHOD: MAX\n"); shrLog("%d elements\n", size); shrLog("%d threads (max)\n", maxThreads); cpuFinalReduction = (cutCheckCmdLineFlag( argc, (const char**) argv, "cpufinal") == CUTTrue); cutGetCmdLineArgumenti( argc, (const char**) argv, "cputhresh", &cpuFinalThreshold); bool runShmoo = (cutCheckCmdLineFlag(argc, (const char**) argv, "shmoo") == CUTTrue); if (runShmoo) { shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype); } else { // create random input data on CPU unsigned int bytes = size * sizeof(T); T *h_idata = (T *) malloc(bytes); for(int i=0; i<size; i++) { // Keep the numbers small so we don't get truncation error in the sum if (datatype == REDUCE_INT) h_idata[i] = (T)(rand() & 0xFF); else h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX; } int numBlocks = 0; int numThreads = 0; getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads); if (numBlocks == 1) cpuFinalThreshold = 1; // allocate mem for the result on host side T* h_odata = (T*) malloc(numBlocks*sizeof(T)); shrLog("%d blocks\n\n", numBlocks); // allocate device memory and data T* d_idata = NULL; T* d_odata = NULL; cutilSafeCallNoSync( cudaMalloc((void**) &d_idata, bytes) ); cutilSafeCallNoSync( cudaMalloc((void**) &d_odata, numBlocks*sizeof(T)) ); // copy data directly to device memory cutilSafeCallNoSync( cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice) ); cutilSafeCallNoSync( cudaMemcpy(d_odata, h_idata, numBlocks*sizeof(T), cudaMemcpyHostToDevice) ); // warm-up maxreduce<T>(size, numThreads, numBlocks, whichKernel, d_idata, d_odata); int testIterations = 100; unsigned int timer = 0; cutilCheckError( cutCreateTimer( &timer)); T gpu_result = 0; gpu_result = benchmarkReduceMax<T>(size, numThreads, numBlocks, maxThreads, maxBlocks, whichKernel, testIterations, cpuFinalReduction, cpuFinalThreshold, timer, h_odata, d_idata, d_odata); double reduceTime = cutGetAverageTimerValue(timer) * 1e-3; shrLogEx(LOGBOTH | MASTER, 0, "Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n", 1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads); // compute reference solution T cpu_result = maxreduceCPU<T>(h_idata, size); double threshold = 1e-12; double diff = 0; if (datatype == REDUCE_INT) { shrLog("\nGPU result = %d\n", gpu_result); shrLog("CPU result = %d\n\n", cpu_result); } else { shrLog("\nGPU result = %f\n", gpu_result); shrLog("CPU result = %f\n\n", cpu_result); if (datatype == REDUCE_FLOAT) threshold = 1e-8 * size; diff = fabs((double)gpu_result - (double)cpu_result); } // cleanup cutilCheckError( cutDeleteTimer(timer) ); free(h_idata); free(h_odata); cutilSafeCallNoSync(cudaFree(d_idata)); cutilSafeCallNoSync(cudaFree(d_odata)); if (datatype == REDUCE_INT) { return (gpu_result == cpu_result); } else { return (diff < threshold); } } return true; }
int main(int argc, char **argv) { char *precisionChoice; cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice); if(precisionChoice == NULL) useDoublePrecision = 0; else { if(!strcasecmp(precisionChoice, "double")) useDoublePrecision = 1; else useDoublePrecision = 0; } const int MAX_GPU_COUNT = 8; const int OPT_N = 256; const int PATH_N = 1 << 18; const unsigned int SEED = 777; //Input data array TOptionData optionData[OPT_N]; //Final GPU MC results TOptionValue callValueGPU[OPT_N]; //"Theoretical" call values by Black-Scholes formula float callValueBS[OPT_N]; //Solver config TOptionPlan optionSolver[MAX_GPU_COUNT]; //OS thread ID CUTThread threadID[MAX_GPU_COUNT]; //GPU number present in the system int GPU_N; int gpuBase, gpuIndex; int i; //Timer unsigned int hTimer; float time; double delta, ref, sumDelta, sumRef, sumReserve; cutilSafeCall( cudaGetDeviceCount(&GPU_N) ); cutilCheckError( cutCreateTimer(&hTimer) ); #ifdef _EMU GPU_N = 1; #endif printf("main(): generating input data...\n"); srand(123); for(i = 0; i < OPT_N; i++) { optionData[i].S = randFloat(5.0f, 50.0f); optionData[i].X = randFloat(10.0f, 25.0f); optionData[i].T = randFloat(1.0f, 5.0f); optionData[i].R = 0.06f; optionData[i].V = 0.10f; callValueGPU[i].Expected = -1.0f; callValueGPU[i].Confidence = -1.0f; } printf("main(): starting %i host threads...\n", GPU_N); //Get option count for each GPU for(i = 0; i < GPU_N; i++) optionSolver[i].optionCount = OPT_N / GPU_N; //Take into account cases with "odd" option counts for(i = 0; i < (OPT_N % GPU_N); i++) optionSolver[i].optionCount++; //Assign GPU option ranges gpuBase = 0; for(i = 0; i < GPU_N; i++) { optionSolver[i].device = i; optionSolver[i].optionData = optionData + gpuBase; optionSolver[i].callValue = callValueGPU + gpuBase; optionSolver[i].seed = SEED; optionSolver[i].pathN = PATH_N; gpuBase += optionSolver[i].optionCount; } //Start the timer cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); //Start CPU thread for each GPU for(gpuIndex = 0; gpuIndex < GPU_N; gpuIndex++) threadID[gpuIndex] = cutStartThread((CUT_THREADROUTINE)solverThread, &optionSolver[gpuIndex]); //Stop the timer cutilCheckError( cutStopTimer(hTimer) ); time = cutGetTimerValue(hTimer); printf("main(): waiting for GPU results...\n"); cutWaitForThreads(threadID, GPU_N); printf("main(): GPU statistics\n"); for(i = 0; i < GPU_N; i++) { printf("GPU #%i\n", optionSolver[i].device); printf("Options : %i\n", optionSolver[i].optionCount); printf("Simulation paths: %i\n", optionSolver[i].pathN); } printf("\nTotal time (ms.): %f\n", time); printf("Options per sec.: %f\n", OPT_N / (time * 0.001)); #ifdef DO_CPU printf("main(): running CPU MonteCarlo...\n"); TOptionValue callValueCPU; sumDelta = 0; sumRef = 0; for(i = 0; i < OPT_N; i++) { MonteCarloCPU( callValueCPU, optionData[i], NULL, PATH_N ); delta = fabs(callValueCPU.Expected - callValueGPU[i].Expected); ref = callValueCPU.Expected; sumDelta += delta; sumRef += fabs(ref); printf("Exp : %f | %f\t", callValueCPU.Expected, callValueGPU[i].Expected); printf("Conf: %f | %f\n", callValueCPU.Confidence, callValueGPU[i].Confidence); } printf("L1 norm: %E\n", sumDelta / sumRef); #endif printf("main(): comparing Monte Carlo and Black-Scholes results...\n"); sumDelta = 0; sumRef = 0; sumReserve = 0; for(i = 0; i < OPT_N; i++) { BlackScholesCall( callValueBS[i], optionData[i] ); delta = fabs(callValueBS[i] - callValueGPU[i].Expected); ref = callValueBS[i]; sumDelta += delta; sumRef += fabs(ref); if(delta > 1e-6) sumReserve += callValueGPU[i].Confidence / delta; #ifdef PRINT_RESULTS printf("BS: %f; delta: %E\n", callValueBS[i], delta); #endif } sumReserve /= OPT_N; printf("L1 norm : %E\n", sumDelta / sumRef); printf("Average reserve: %f\n", sumReserve); printf((sumReserve > 1.0f) ? "PASSED\n" : "FAILED.\n"); printf("Shutting down...\n"); cutilCheckError( cutDeleteTimer(hTimer) ); cutilExit(argc, argv); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { //start logs shrSetLogFileName ("volumeRender.txt"); shrLog("%s Starting...\n\n", argv[0]); if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { g_bQAReadback = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bQAGLVerify = true; fpsLimit = frameCheckNumber; } if (g_bQAReadback) { // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilDeviceInit(argc, argv); } else { cudaSetDevice( cutGetMaxGflopsDeviceId() ); } } else { // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL( &argc, argv ); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilGLDeviceInit(argc, argv); } else { cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); } /* int device; struct cudaDeviceProp prop; cudaGetDevice( &device ); cudaGetDeviceProperties( &prop, device ); if( !strncmp( "Tesla", prop.name, 5 ) ) { shrLog("This sample needs a card capable of OpenGL and display.\n"); shrLog("Please choose a different device with the -device=x argument.\n"); cutilExit(argc, argv); } */ } // parse arguments char *filename; if (cutGetCmdLineArgumentstr( argc, (const char**) argv, "file", &filename)) { volumeFilename = filename; } int n; if (cutGetCmdLineArgumenti( argc, (const char**) argv, "size", &n)) { volumeSize.width = volumeSize.height = volumeSize.depth = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "xsize", &n)) { volumeSize.width = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "ysize", &n)) { volumeSize.height = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "zsize", &n)) { volumeSize.depth = n; } // load volume data char* path = shrFindFilePath(volumeFilename, argv[0]); if (path == 0) { shrLog("Error finding file '%s'\n", volumeFilename); exit(EXIT_FAILURE); } size_t size = volumeSize.width*volumeSize.height*volumeSize.depth*sizeof(VolumeType); void *h_volume = loadRawFile(path, size); initCuda(h_volume, volumeSize); free(h_volume); cutilCheckError( cutCreateTimer( &timer)); shrLog("Press '=' and '-' to change density\n" " ']' and '[' to change brightness\n" " ';' and ''' to modify transfer function offset\n" " '.' and ',' to modify transfer function scale\n\n"); // calculate new grid size gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y)); if (g_bQAReadback) { g_CheckRender = new CheckBackBuffer(width, height, 4, false); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); uint *d_output; cutilSafeCall(cudaMalloc((void**)&d_output, width*height*sizeof(uint))); cutilSafeCall(cudaMemset(d_output, 0, width*height*sizeof(uint))); float modelView[16] = { 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 4.0f, 1.0f }; invViewMatrix[0] = modelView[0]; invViewMatrix[1] = modelView[4]; invViewMatrix[2] = modelView[8]; invViewMatrix[3] = modelView[12]; invViewMatrix[4] = modelView[1]; invViewMatrix[5] = modelView[5]; invViewMatrix[6] = modelView[9]; invViewMatrix[7] = modelView[13]; invViewMatrix[8] = modelView[2]; invViewMatrix[9] = modelView[6]; invViewMatrix[10] = modelView[10]; invViewMatrix[11] = modelView[14]; // call CUDA kernel, writing results to PBO copyInvViewMatrix(invViewMatrix, sizeof(float4)*3); // Start timer 0 and process n loops on the GPU int nIter = 10; for (int i = -1; i < nIter; i++) { if( i == 0 ) { cudaThreadSynchronize(); cutStartTimer(timer); } render_kernel(gridSize, blockSize, d_output, width, height, density, brightness, transferOffset, transferScale); } cudaThreadSynchronize(); cutStopTimer(timer); // Get elapsed time and throughput, then log to sample and master logs double dAvgTime = cutGetTimerValue(timer)/(nIter * 1000.0); shrLogEx(LOGBOTH | MASTER, 0, "volumeRender, Throughput = %.4f MTexels/s, Time = %.5f s, Size = %u Texels, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * width * height)/dAvgTime, dAvgTime, (width * height), 1, blockSize.x * blockSize.y); cutilCheckMsg("Error: render_kernel() execution FAILED"); cutilSafeCall( cudaThreadSynchronize() ); cutilSafeCall( cudaMemcpy(g_CheckRender->imageData(), d_output, width*height*4, cudaMemcpyDeviceToHost) ); g_CheckRender->savePPM(sOriginal[g_Index], true, NULL); if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, THRESHOLD)) { shrLog("\nFAILED\n\n"); } else { shrLog("\nPASSED\n\n"); } cudaFree(d_output); freeCudaBuffers(); if (g_CheckRender) { delete g_CheckRender; g_CheckRender = NULL; } } else { // This is the normal rendering path for VolumeRender glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutMouseFunc(mouse); glutMotionFunc(motion); glutReshapeFunc(reshape); glutIdleFunc(idle); initPixelBuffer(); if (g_bQAGLVerify) { g_CheckRender = new CheckBackBuffer(width, height, 4); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } atexit(cleanup); glutMainLoop(); } cudaThreadExit(); shrEXIT(argc, (const char**)argv); }
int main(int argc, char** argv) { // EDISON ////////////////////////////////////////////////////////////////// sigmaS = 7.0f; sigmaR = 6.5f; edison.minRegion = 20.0f; cutLoadPPMub("image.ppm", &edison.inputImage_, &width, &height); edison.meanShift(); cutSavePPMub("segmimage.ppm", edison.segmImage_, width, height); cutSavePPMub("filtimage.ppm", edison.filtImage_, width, height); unsigned char data[height * width]; memset(data, 0, height * width * sizeof(unsigned char)); for(int i = 0; i < edison.numBoundaries_; i++) { data[edison.boundaries_[i]] = 255; } cutSavePGMub("bndyimage.pgm", data, width, height); //return 0; // EDISON ////////////////////////////////////////////////////////////////// if (argc > 1) { if (cutCheckCmdLineFlag(argc, (const char **)argv, "help")) { printHelp(); } if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { g_bQAReadback = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bOpenGLQA = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "fbo")) { g_bFBODisplay = true; fpsLimit = frameCheckNumber; } } if (g_bQAReadback) { runAutoTest(argc, argv); } else { // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL( argc, argv ); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilGLDeviceInit(argc, argv); } else { cudaGLSetGLDevice (cutGetMaxGflopsDeviceId() ); } int device; struct cudaDeviceProp prop; cudaGetDevice( &device ); cudaGetDeviceProperties( &prop, device ); if(!strncmp( "Tesla", prop.name, 5 )) { printf("This sample needs a card capable of OpenGL and display.\n"); printf("Please choose a different device with the -device=x argument.\n"); cudaThreadExit(); cutilExit(argc, argv); } cutilCheckError(cutCreateTimer(&timer)); cutilCheckError(cutResetTimer(timer)); glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutReshapeFunc(reshape); glutIdleFunc(idle); if (g_bOpenGLQA) { loadDefaultImage( argv[0] ); } if (argc > 1) { char *filename; if (cutGetCmdLineArgumentstr(argc, (const char **)argv, "file", &filename)) { initializeData(filename); } } else { loadDefaultImage( argv[0]); } // If code is not printing the USage, then we execute this path. if (!bQuit) { if (g_bOpenGLQA) { g_CheckRender = new CheckBackBuffer(wWidth, wHeight, 4); g_CheckRender->setPixelFormat(GL_BGRA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } printf("I: display image\n"); printf("T: display Sobel edge detection (computed with tex)\n"); printf("S: display Sobel edge detection (computed with tex+shared memory)\n"); printf("Use the '-' and '=' keys to change the brightness.\n"); fflush(stdout); atexit(cleanup); glutMainLoop(); } } cudaThreadExit(); cutilExit(argc, argv); }
//////////////////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv){ const unsigned int OPT_N_MAX = 512; unsigned int useDoublePrecision; printf("[binomialOptions]\n"); int devID = cutilDeviceInit(argc, argv); if (devID < 0) { printf("exiting...\n"); cutilExit(argc, argv); exit(0); } cutilSafeCall(cudaGetDevice(&devID)); cudaDeviceProp deviceProp; cutilSafeCall(cudaGetDeviceProperties(&deviceProp, devID)); char *precisionChoice; cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice); if(precisionChoice == NULL) { useDoublePrecision = 0; } else { if(!strcasecmp(precisionChoice, "double")) useDoublePrecision = 1; else useDoublePrecision = 0; } printf(useDoublePrecision ? "Using double precision...\n" : "Using single precision...\n"); const int OPT_N = deviceEmulation() ? 1 : OPT_N_MAX; TOptionData optionData[OPT_N_MAX]; float callValueBS[OPT_N_MAX], callValueGPU[OPT_N_MAX], callValueCPU[OPT_N_MAX]; double sumDelta, sumRef, gpuTime, errorVal; unsigned int hTimer; int i; cutilCheckError( cutCreateTimer(&hTimer) ); int version = deviceProp.major * 10 + deviceProp.minor; if(useDoublePrecision && version < 13){ printf("Double precision is not supported.\n"); return 0; } printf("Generating input data...\n"); //Generate options set srand(123); for(i = 0; i < OPT_N; i++){ optionData[i].S = randData(5.0f, 30.0f); optionData[i].X = randData(1.0f, 100.0f); optionData[i].T = randData(0.25f, 10.0f); optionData[i].R = 0.06f; optionData[i].V = 0.10f; BlackScholesCall(callValueBS[i], optionData[i]); } printf("Running GPU binomial tree...\n"); cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); if(useDoublePrecision) binomialOptions_SM13(callValueGPU, optionData, OPT_N); else binomialOptions_SM10(callValueGPU, optionData, OPT_N); cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutStopTimer(hTimer) ); gpuTime = cutGetTimerValue(hTimer); printf("Options count : %i \n", OPT_N); printf("Time steps : %i \n", NUM_STEPS); printf("binomialOptionsGPU() time: %f msec\n", gpuTime); printf("Options per second : %f \n", OPT_N / (gpuTime * 0.001)); printf("Running CPU binomial tree...\n"); for(i = 0; i < OPT_N; i++) binomialOptionsCPU(callValueCPU[i], optionData[i]); printf("Comparing the results...\n"); sumDelta = 0; sumRef = 0; printf("GPU binomial vs. Black-Scholes\n"); for(i = 0; i < OPT_N; i++){ sumDelta += fabs(callValueBS[i] - callValueGPU[i]); sumRef += fabs(callValueBS[i]); } if(sumRef >1E-5) printf("L1 norm: %E\n", sumDelta / sumRef); else printf("Avg. diff: %E\n", sumDelta / (double)OPT_N); printf("CPU binomial vs. Black-Scholes\n"); sumDelta = 0; sumRef = 0; for(i = 0; i < OPT_N; i++){ sumDelta += fabs(callValueBS[i]- callValueCPU[i]); sumRef += fabs(callValueBS[i]); } if(sumRef >1E-5) printf("L1 norm: %E\n", sumDelta / sumRef); else printf("Avg. diff: %E\n", sumDelta / (double)OPT_N); printf("CPU binomial vs. GPU binomial\n"); sumDelta = 0; sumRef = 0; for(i = 0; i < OPT_N; i++){ sumDelta += fabs(callValueGPU[i] - callValueCPU[i]); sumRef += callValueCPU[i]; } if(sumRef > 1E-5) printf("L1 norm: %E\n", errorVal = sumDelta / sumRef); else printf("Avg. diff: %E\n", errorVal = sumDelta / (double)OPT_N); printf("Shutting down...\n"); printf("\n[binomialOptions] - Test Summary:\n"); printf((errorVal < 5e-4) ? "PASSED\n" : "FAILED\n"); cutilCheckError( cutDeleteTimer(hTimer) ); cudaThreadExit(); cutilExit(argc, argv); }
int main(int argc, char **argv) { // Start logs shrSetLogFileName ("quasirandomGenerator.txt"); shrLog("%s Starting...\n\n", argv[0]); unsigned int useDoublePrecision; char *precisionChoice; cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice); if(precisionChoice == NULL) useDoublePrecision = 0; else{ if(!strcasecmp(precisionChoice, "double")) useDoublePrecision = 1; else useDoublePrecision = 0; } unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION]; float *h_OutputGPU; float *d_Output; int dim, pos; double delta, ref, sumDelta, sumRef, L1norm, gpuTime; unsigned int hTimer; if(sizeof(INT64) != 8){ shrLog("sizeof(INT64) != 8\n"); return 0; } // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) cutilDeviceInit(argc, argv); else cudaSetDevice( cutGetMaxGflopsDeviceId() ); cutilCheckError(cutCreateTimer(&hTimer)); int deviceIndex; cutilSafeCall(cudaGetDevice(&deviceIndex)); cudaDeviceProp deviceProp; cutilSafeCall(cudaGetDeviceProperties(&deviceProp, deviceIndex)); int version = deviceProp.major * 10 + deviceProp.minor; if(useDoublePrecision && version < 13){ shrLog("Double precision not supported.\n"); cudaThreadExit(); return 0; } shrLog("Allocating GPU memory...\n"); cutilSafeCall( cudaMalloc((void **)&d_Output, QRNG_DIMENSIONS * N * sizeof(float)) ); shrLog("Allocating CPU memory...\n"); h_OutputGPU = (float *)malloc(QRNG_DIMENSIONS * N * sizeof(float)); shrLog("Initializing QRNG tables...\n\n"); initQuasirandomGenerator(tableCPU); if(useDoublePrecision) initTable_SM13(tableCPU); else initTable_SM10(tableCPU); shrLog("Testing QRNG...\n\n"); cutilSafeCall( cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)) ); int numIterations = 20; for (int i = -1; i < numIterations; i++) { if (i == 0) { cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } if(useDoublePrecision) quasirandomGenerator_SM13(d_Output, 0, N); else quasirandomGenerator_SM10(d_Output, 0, N); } cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError(cutStopTimer(hTimer)); gpuTime = cutGetTimerValue(hTimer)/(double)numIterations*1e-3; shrLogEx(LOGBOTH | MASTER, 0, "quasirandomGenerator, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", (double)QRNG_DIMENSIONS * (double)N * 1.0E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128*QRNG_DIMENSIONS); shrLog("\nReading GPU results...\n"); cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost) ); shrLog("Comparing to the CPU results...\n\n"); sumDelta = 0; sumRef = 0; for(dim = 0; dim < QRNG_DIMENSIONS; dim++) for(pos = 0; pos < N; pos++){ ref = getQuasirandomValue63(pos, dim); delta = (double)h_OutputGPU[dim * N + pos] - ref; sumDelta += fabs(delta); sumRef += fabs(ref); } shrLog("L1 norm: %E\n", sumDelta / sumRef); shrLog("\nTesting inverseCNDgpu()...\n\n"); cutilSafeCall( cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)) ); for (int i = -1; i < numIterations; i++) { if (i == 0) { cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } if(useDoublePrecision) inverseCND_SM13(d_Output, NULL, QRNG_DIMENSIONS * N); else inverseCND_SM10(d_Output, NULL, QRNG_DIMENSIONS * N); } cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError(cutStopTimer(hTimer)); gpuTime = cutGetTimerValue(hTimer)/(double)numIterations*1e-3; shrLogEx(LOGBOTH | MASTER, 0, "quasirandomGenerator-inverse, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", (double)QRNG_DIMENSIONS * (double)N * 1E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128); shrLog("Reading GPU results...\n"); cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost) ); shrLog("\nComparing to the CPU results...\n"); sumDelta = 0; sumRef = 0; for(pos = 0; pos < QRNG_DIMENSIONS * N; pos++){ double p = (double)(pos + 1) / (double)(QRNG_DIMENSIONS * N + 1); ref = MoroInvCNDcpu(p); delta = (double)h_OutputGPU[pos] - ref; sumDelta += fabs(delta); sumRef += fabs(ref); } shrLog("L1 norm: %E\n\n", L1norm = sumDelta / sumRef); shrLog((L1norm < 1E-6) ? "PASSED\n\n" : "FAILED\n\n"); shrLog("Shutting down...\n"); cutilCheckError(cutDeleteTimer(hTimer)); free(h_OutputGPU); cutilSafeCall( cudaFree(d_Output) ); cudaThreadExit(); shrEXIT(argc, (const char**)argv); }