void benchmark(int iterations) { // allocate memory for result unsigned int *d_result; unsigned int size = width * height * sizeof(unsigned int); cutilSafeCall( cudaMalloc( (void**) &d_result, size)); // warm-up gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutStartTimer( timer)); // execute the kernel for(int i=0; i<iterations; i++) { gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); } cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutStopTimer( timer)); // check if kernel execution generated an error cutilCheckMsg("Kernel execution failed"); printf("Processing time: %f (ms)\n", cutGetTimerValue( timer)); printf("%.2f Mpixels/sec\n", (width*height*iterations / (cutGetTimerValue( timer) / 1000.0f)) / 1e6); cutilSafeCall(cudaFree(d_result)); }
// keplereq_wrapper_C: // C wrapper function to solve's Kepler's equation num times. // inputs: // ph_ma: pointer to beginning element of array of doubles containing mean anomaly in radians // ph_ecc: pointer to beginning element of array of doubles containing eccentricity // num: integer size of input arrays // ph_eccanom: pointer to beginning element of array of doubles eccentric anomaly in radians // outputs: // ph_eccanom: values overwritten with eccentric anomaly // assumptions: // input mean anomalies between 0 and 2pi // input eccentricities between 0 and 1 // all three arrays have at least num elements // void keplereq_wrapper_c(double *ph_ma, double *ph_ecc, int num, double *ph_eccanom) { int gpuid = init_cuda(); // put vectors in thrust format from raw points thrust::host_vector<double> h_ecc(ph_ecc,ph_ecc+num); thrust::host_vector<double> h_ma(ph_ma,ph_ma+num); cutCreateTimer(&memoryTime); cutCreateTimer(&kernelTime); cutResetTimer(memoryTime); cutResetTimer(kernelTime); if(gpuid>=0) { cutStartTimer(memoryTime); // transfer input params to GPU thrust::device_vector<double> d_ecc = h_ecc; thrust::device_vector<double> d_ma = h_ma; // allocate mem on GPU thrust::device_vector<double> d_eccanom(num); cudaThreadSynchronize(); cutStopTimer(memoryTime); // distribute the computation to the GPU cutStartTimer(kernelTime); thrust::for_each( thrust::make_zip_iterator(thrust::make_tuple(d_ma.begin(),d_ecc.begin(),d_eccanom.begin())), thrust::make_zip_iterator(thrust::make_tuple(d_ma.end(), d_ecc.end(), d_eccanom.end())), keplereq_functor() ); cudaThreadSynchronize(); cutStopTimer(kernelTime); // transfer results back to host cutStartTimer(memoryTime); thrust::copy(d_eccanom.begin(),d_eccanom.end(),ph_eccanom); cudaThreadSynchronize(); cutStopTimer(memoryTime); } else { // distribute the computation to the CPU cutStartTimer(kernelTime); thrust::for_each( thrust::make_zip_iterator(thrust::make_tuple(h_ma.begin(),h_ecc.begin(),ph_eccanom)), thrust::make_zip_iterator(thrust::make_tuple(h_ma.end(), h_ecc.end(), ph_eccanom+num)), keplereq_functor() ); cutStopTimer(kernelTime); } }
// This is the normal display path void display(void) { cutilCheckError(cutStartTimer(timer)); // Sobel operation Pixel *data = NULL; // map PBO to get CUDA device pointer cutilSafeCall(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); size_t num_bytes; cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void **)&data, &num_bytes, cuda_pbo_resource)); //printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes); sobelFilter(data, imWidth, imHeight, g_SobelDisplayMode, imageScale, blockOp, pointOp ); cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); glClear(GL_COLOR_BUFFER_BIT); glBindTexture(GL_TEXTURE_2D, texid); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, imWidth, imHeight, GL_LUMINANCE, GL_UNSIGNED_BYTE, OFFSET(0)); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); glDisable(GL_DEPTH_TEST); glEnable(GL_TEXTURE_2D); glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT); glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT); glBegin(GL_QUADS); glVertex2f(0, 0); glTexCoord2f(0, 0); glVertex2f(0, 1); glTexCoord2f(1, 0); glVertex2f(1, 1); glTexCoord2f(1, 1); glVertex2f(1, 0); glTexCoord2f(0, 1); glEnd(); glBindTexture(GL_TEXTURE_2D, 0); if (g_CheckRender && g_CheckRender->IsQAReadback() && g_Verify) { printf("> (Frame %d) readback BackBuffer\n", frameCount); g_CheckRender->readback( imWidth, imHeight ); g_CheckRender->savePPM ( sOriginal_ppm[g_Index], true, NULL ); if (!g_CheckRender->PPMvsPPM(sOriginal_ppm[g_Index], sReference_ppm[g_Index], MAX_EPSILON_ERROR, 0.15f)) { g_TotalErrors++; } g_Verify = false; } glutSwapBuffers(); cutilCheckError(cutStopTimer(timer)); computeFPS(); glutPostRedisplay(); }
void fpsDisplay() { cutilCheckError(cutStartTimer(timer)); display(); cutilCheckError(cutStopTimer(timer)); computeFPS(); }
void cleanup() { cutilCheckError(cutStopTimer(timer)); cutilCheckError(cutDeleteTimer( timer)); cudaFree(a_d);cudaFree(b_d);cudaFree(r_d); cudaThreadExit(); }
// display results using OpenGL void display() { cutilCheckError(cutStartTimer(timer)); // execute filter, writing results to pbo unsigned int *d_result; //DEPRECATED: cutilSafeCall( cudaGLMapBufferObject((void**)&d_result, pbo) ); cutilSafeCall(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); size_t num_bytes; cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void **)&d_result, &num_bytes, cuda_pbo_resource)); runSelect(d_result); // DEPRECATED: cutilSafeCall(cudaGLUnmapBufferObject(pbo)); cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); // Common display code path { glClear(GL_COLOR_BUFFER_BIT); // load texture from pbo glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); glBindTexture(GL_TEXTURE_2D, texid); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0); glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0); // fragment program is required to display floating point texture glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader); glEnable(GL_FRAGMENT_PROGRAM_ARB); glDisable(GL_DEPTH_TEST); glBegin(GL_QUADS); { glTexCoord2f(0, 0); glVertex2f(0, 0); glTexCoord2f(1, 0); glVertex2f(1, 0); glTexCoord2f(1, 1); glVertex2f(1, 1); glTexCoord2f(0, 1); glVertex2f(0, 1); } glEnd(); glBindTexture(GL_TEXTURE_TYPE, 0); glDisable(GL_FRAGMENT_PROGRAM_ARB); } glutSwapBuffers(); glutReportErrors(); cutilCheckError(cutStopTimer(timer)); computeFPS(); }
void runBenchmark(int iterations) { cutilCheckError(cutStartTimer(timer)); for (int i = 0; i < iterations; ++i) { psystem->update(timestep); } cutilCheckError(cutStopTimer(timer)); float milliseconds = cutGetTimerValue(timer); printf("%d particles, total time for %d iterations: %0.3f ms\n", numParticles, iterations, milliseconds); printf("Test PASSED\n"); }
// display results using OpenGL void display() { cutilCheckError(cutStartTimer(timer)); // execute filter, writing results to pbo unsigned int *d_result; cutilSafeCall(cudaGLMapBufferObject((void**)&d_result, pbo)); gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); cutilSafeCall(cudaGLUnmapBufferObject(pbo)); // load texture from pbo glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); glBindTexture(GL_TEXTURE_2D, texid); glPixelStorei(GL_UNPACK_ALIGNMENT, 1); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0); glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0); // display results glClear(GL_COLOR_BUFFER_BIT); glEnable(GL_TEXTURE_2D); glDisable(GL_DEPTH_TEST); glBegin(GL_QUADS); glTexCoord2f(0, 1); glVertex2f(0, 0); glTexCoord2f(1, 1); glVertex2f(1, 0); glTexCoord2f(1, 0); glVertex2f(1, 1); glTexCoord2f(0, 0); glVertex2f(0, 1); glEnd(); glDisable(GL_TEXTURE_2D); if (g_CheckRender && g_CheckRender->IsQAReadback() && g_Verify) { // readback for QA testing printf("> (Frame %d) Readback BackBuffer\n", frameCount); g_CheckRender->readback( width, height ); g_CheckRender->savePPM(sOriginal[g_Index], true, NULL); if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, 0.50f )) { g_TotalErrors++; } g_Verify = false; } glutSwapBuffers(); cutilCheckError(cutStopTimer(timer)); computeFPS(); }
//////////////////////////////////////////////////////////////////////////////// //! Display callback //////////////////////////////////////////////////////////////////////////////// void display() { cutilCheckError(cutStartTimer(timer)); // run CUDA kernel to generate vertex positions runCuda(vbo); glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); // set view matrix glMatrixMode(GL_MODELVIEW); glLoadIdentity(); glTranslatef(0.0, 0.0, translate_z); glRotatef(rotate_x, 1.0, 0.0, 0.0); glRotatef(rotate_y, 0.0, 1.0, 0.0); // render from the vbo glBindBuffer(GL_ARRAY_BUFFER, vbo); glVertexPointer(4, GL_FLOAT, 0, 0); glEnableClientState(GL_VERTEX_ARRAY); glColor3f(1.0, 0.0, 0.0); glDrawArrays(GL_POINTS, 0, mesh_width * mesh_height); glDisableClientState(GL_VERTEX_ARRAY); if (g_CheckRender && g_CheckRender->IsQAReadback() && g_Verify) { // readback for QA testing printf("> (Frame %d) Readback BackBuffer\n", frameCount); g_CheckRender->readback( window_width, window_height ); g_CheckRender->savePPM(sOriginal[g_Index], true, NULL); if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, 0.15f)) { g_TotalErrors++; } else { printf( "TEST PASSED\n" ); } g_Verify = false; } glutSwapBuffers(); glutPostRedisplay(); anim += 0.01; cutilCheckError(cutStopTimer(timer)); computeFPS(); }
void cleanup() { free(a_h);free(b_h);free(r_h); free(control); cutilCheckError(cutStopTimer(timer)); cutilCheckError(cutDeleteTimer( timer)); cudaFree(a_d);cudaFree(b_d);cudaFree(r_d); cutilSafeCall(release()); checkCUDAError("release"); cudaThreadExit(); }
void _runBenchmark(int iterations) { // once without timing to prime the device if (!useCpu) m_nbody->update(activeParams.m_timestep); if (useCpu) { cutCreateTimer(&timer); cutStartTimer(timer); } else { cutilSafeCall(cudaEventRecord(startEvent, 0)); } for (int i = 0; i < iterations; ++i) { m_nbody->update(activeParams.m_timestep); } float milliseconds = 0; if (useCpu) { cutStopTimer(timer); milliseconds = cutGetTimerValue(timer); cutDeleteTimer(timer); } else { cutilSafeCall(cudaEventRecord(stopEvent, 0)); cutilSafeCall(cudaEventSynchronize(stopEvent)); cutilSafeCall( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent)); } double interactionsPerSecond = 0; double gflops = 0; computePerfStats(interactionsPerSecond, gflops, milliseconds, iterations); shrLog("%d bodies, total time for %d iterations: %.3f ms\n", numBodies, iterations, milliseconds); shrLog("= %.3f billion interactions per second\n", interactionsPerSecond); shrLog("= %.3f %s-precision GFLOP/s at %d flops per interaction\n", gflops, (sizeof(T) > 4) ? "double" : "single", flopsPerInteraction); }
void runAutoTest(int argc, char **argv) { // allocate memory for result unsigned int *d_result; unsigned int size = width * height * sizeof(unsigned int); cutilSafeCall( cudaMalloc( (void**) &d_result, size)); // warm-up gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutStartTimer( timer)); while (sigma <= 22) { gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); cutilSafeCall( cudaThreadSynchronize() ); // check if kernel execution generated an error cutilCheckMsg("Kernel execution failed"); cudaMemcpy(g_CheckRender->imageData(), d_result, width*height*4, cudaMemcpyDeviceToHost); g_CheckRender->savePPM(sOriginal[g_Index], false, NULL); if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, 0.50f)) { g_TotalErrors++; } g_Index++; sigma += 4; } cutilCheckError( cutStopTimer( timer)); printf("Processing time: %f (ms)\n", cutGetTimerValue( timer)); printf("%.2f Mpixels/sec\n", (width*height*g_Index / (cutGetTimerValue( timer) / 1000.0f)) / 1e6); printf("Summary: %d errors!\n", g_TotalErrors); printf("Test %s!\n", (g_TotalErrors==0) ? "PASSED" : "FAILED"); cutilSafeCall(cudaFree(d_result)); }
void disp(void){ glClear(GL_COLOR_BUFFER_BIT); update_phi(); its++; if(its<ITERATIONS){ glutPostRedisplay(); if(its%50==0){ printf("Iteration %3d Total Time: %3.2f ReInit Time: %3.2f\n", its, 0.001*cutGetTimerValue(Timer), 0.001*cutGetTimerValue(ReInitTimer)); cutStartTimer(ReInitTimer); // ReInit Timer Start reinit_phi(); // ReInit glDrawPixels(imageW, imageH, GL_GREEN, GL_FLOAT, phi); glutSwapBuffers(); cutStopTimer(ReInitTimer); // ReInit Timer Stop } } else { printf("Iteration %3d Total Time: %3.2f ReInit Time: %3.2f\n", its, 0.001*cutGetTimerValue(Timer), 0.001*cutGetTimerValue(ReInitTimer)); glDrawPixels(imageW, imageH, GL_GREEN, GL_FLOAT, phi); glutSwapBuffers(); } }
void shutDown(unsigned char k, int /*x*/, int /*y*/) { switch (k){ case '\033': case 'q': case 'Q': printf("Shutting down...\n"); cutilCheckError( cutStopTimer(hTimer) ); cutilCheckError( cutDeleteTimer(hTimer) ); // DEPRECATED: cutilSafeCall( cudaGLRegisterBufferObject(gl_PBO) ); cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, gl_PBO, cudaGraphicsMapFlagsWriteDiscard)); glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); glDeleteBuffers(1, &gl_PBO); glDeleteTextures(1, &gl_Tex); cutilSafeCall( CUDA_FreeArray() ); free(h_Src); printf("Shutdown done.\n"); cutilDeviceReset(); exit(0); break; case '1': printf("Passthrough.\n"); g_Kernel = 0; break; case '2': printf("KNN method \n"); g_Kernel = 1; break; case '3': printf("NLM method\n"); g_Kernel = 2; break; case '4': printf("Quick NLM(NLM2) method\n"); g_Kernel = 3; break; case ' ': printf(g_Diag ? "LERP highlighting mode.\n" : "Normal mode.\n"); g_Diag = !g_Diag; break; case 'n': printf("Decrease noise level.\n"); knnNoise -= noiseStep; nlmNoise -= noiseStep; break; case 'N': printf("Increase noise level.\n"); knnNoise += noiseStep; nlmNoise += noiseStep; break; case 'l': printf("Decrease LERP quotent.\n"); lerpC = MAX(lerpC - lerpStep, 0.0f); break; case 'L': printf("Increase LERP quotent.\n"); lerpC = MIN(lerpC + lerpStep, 1.0f); break; case 'f' : case 'F': g_FPS = true; break; case '?': printf("lerpC = %5.5f\n", lerpC); printf("knnNoise = %5.5f\n", knnNoise); printf("nlmNoise = %5.5f\n", nlmNoise); break; } }
int main(int argc, char **argv) { uchar *h_Data; uint *h_HistogramCPU, *h_HistogramGPU; uchar *d_Data; uint *d_Histogram; uint hTimer; int PassFailFlag = 1; uint byteCount = 64 * 1048576; uint uiSizeMult = 1; cudaDeviceProp deviceProp; deviceProp.major = 0; deviceProp.minor = 0; int dev; shrQAStart(argc, argv); // set logfile name and start logs shrSetLogFileName ("histogram.txt"); //Use command-line specified CUDA device, otherwise use device with highest Gflops/s if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") ) { dev = cutilDeviceInit(argc, argv); if (dev < 0) { printf("No CUDA Capable Devices found, exiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } } else { cudaSetDevice( dev = cutGetMaxGflopsDeviceId() ); cutilSafeCall( cudaChooseDevice(&dev, &deviceProp) ); } cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev) ); printf("CUDA device [%s] has %d Multi-Processors, Compute %d.%d\n", deviceProp.name, deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor); int version = deviceProp.major * 0x10 + deviceProp.minor; if(version < 0x11) { printf("There is no device supporting a minimum of CUDA compute capability 1.1 for this SDK sample\n"); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } cutilCheckError(cutCreateTimer(&hTimer)); // Optional Command-line multiplier to increase size of array to histogram if (shrGetCmdLineArgumentu(argc, (const char**)argv, "sizemult", &uiSizeMult)) { uiSizeMult = CLAMP(uiSizeMult, 1, 10); byteCount *= uiSizeMult; } shrLog("Initializing data...\n"); shrLog("...allocating CPU memory.\n"); h_Data = (uchar *)malloc(byteCount); h_HistogramCPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint)); h_HistogramGPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint)); shrLog("...generating input data\n"); srand(2009); for(uint i = 0; i < byteCount; i++) h_Data[i] = rand() % 256; shrLog("...allocating GPU memory and copying input data\n\n"); cutilSafeCall( cudaMalloc((void **)&d_Data, byteCount ) ); cutilSafeCall( cudaMalloc((void **)&d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint) ) ); cutilSafeCall( cudaMemcpy(d_Data, h_Data, byteCount, cudaMemcpyHostToDevice) ); { shrLog("Starting up 64-bin histogram...\n\n"); initHistogram64(); shrLog("Running 64-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns); for(int iter = -1; iter < numRuns; iter++){ //iter == -1 -- warmup iteration if(iter == 0){ cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } histogram64(d_Histogram, d_Data, byteCount); } cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutStopTimer(hTimer)); double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns; shrLog("histogram64() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs); shrLogEx(LOGBOTH | MASTER, 0, "histogram64, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM64_THREADBLOCK_SIZE); shrLog("\nValidating GPU results...\n"); shrLog(" ...reading back GPU results\n"); cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM64_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) ); shrLog(" ...histogram64CPU()\n"); histogram64CPU( h_HistogramCPU, h_Data, byteCount ); shrLog(" ...comparing the results...\n"); for(uint i = 0; i < HISTOGRAM64_BIN_COUNT; i++) if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0; shrLog(PassFailFlag ? " ...64-bin histograms match\n\n" : " ***64-bin histograms do not match!!!***\n\n" ); shrLog("Shutting down 64-bin histogram...\n\n\n"); closeHistogram64(); } { shrLog("Initializing 256-bin histogram...\n"); initHistogram256(); shrLog("Running 256-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns); for(int iter = -1; iter < numRuns; iter++){ //iter == -1 -- warmup iteration if(iter == 0){ cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } histogram256(d_Histogram, d_Data, byteCount); } cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutStopTimer(hTimer)); double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns; shrLog("histogram256() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs); shrLogEx(LOGBOTH | MASTER, 0, "histogram256, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM256_THREADBLOCK_SIZE); shrLog("\nValidating GPU results...\n"); shrLog(" ...reading back GPU results\n"); cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) ); shrLog(" ...histogram256CPU()\n"); histogram256CPU( h_HistogramCPU, h_Data, byteCount ); shrLog(" ...comparing the results\n"); for(uint i = 0; i < HISTOGRAM256_BIN_COUNT; i++) if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0; shrLog(PassFailFlag ? " ...256-bin histograms match\n\n" : " ***256-bin histograms do not match!!!***\n\n" ); shrLog("Shutting down 256-bin histogram...\n\n\n"); closeHistogram256(); } shrLog("Shutting down...\n"); cutilCheckError(cutDeleteTimer(hTimer)); cutilSafeCall( cudaFree(d_Histogram) ); cutilSafeCall( cudaFree(d_Data) ); free(h_HistogramGPU); free(h_HistogramCPU); free(h_Data); cutilDeviceReset(); shrLog("%s - Test Summary\n", sSDKsample); // pass or fail (for both 64 bit and 256 bit histograms) shrQAFinishExit(argc, (const char **)argv, (PassFailFlag ? QA_PASSED : QA_FAILED)); }
// Host code int main(int argc, char** argv) { ParseArguments(argc, argv); float s_SobelMatrix[25]; s_SobelMatrix[0] = 1; s_SobelMatrix[1] = 2; s_SobelMatrix[2]= 0; s_SobelMatrix[3] = -2; s_SobelMatrix[4] = -1; s_SobelMatrix[5] = 4; s_SobelMatrix[6] = 8; s_SobelMatrix[7] = 0; s_SobelMatrix[8] = -8; s_SobelMatrix[9] = -4; s_SobelMatrix[10] = 6; s_SobelMatrix[11] = 12; s_SobelMatrix[12] = 0; s_SobelMatrix[13] = -12; s_SobelMatrix[14] = -6; s_SobelMatrix[15] = 4; s_SobelMatrix[16] = 8; s_SobelMatrix[17] = 0; s_SobelMatrix[18] = -8; s_SobelMatrix[19] =-4; s_SobelMatrix[20] =1; s_SobelMatrix[21] =2; s_SobelMatrix[22] =0; s_SobelMatrix[23] =-2; s_SobelMatrix[24] =-1; unsigned char *palete = NULL; unsigned char *data = NULL, *out = NULL; PPMImage *input_image=NULL, *output_image=NULL; output_image = (PPMImage *)malloc(sizeof(PPMImage)); input_image = readPPM(PPMInFileL); printf("Running %s filter\n", Filter); out = (unsigned char *)malloc(); printf("Computing the CPU output\n"); printf("Image details: %d by %d = %d , imagesize = %d\n", input_image->x, input_image->y, input_image->x * input_image->y, input_image->x * input_image->y); cutilCheckError(cutStartTimer(time_CPU)); if(FilterMode == SOBEL_FILTER){ printf("Running Sobel\n"); CPU_Sobel(intput_image->data, output_image, input_image->x, input_image->y); } else if(FilterMode == HIGH_BOOST_FILTER){ printf("Running boost\n"); CPU_Boost(data, out, dib.width, dib.height); } cutilCheckError(cutStopTimer(time_CPU)); if(FilterMode == SOBEL_FILTER || FilterMode == SOBEL_FILTER5) BitMapWrite("CPU_sobel.bmp", &bmp, &dib, out, palete); else if(FilterMode == AVERAGE_FILTER) BitMapWrite("CPU_average.bmp", &bmp, &dib, out, palete); else if(FilterMode == HIGH_BOOST_FILTER) BitMapWrite("CPU_boost.bmp", &bmp, &dib, out, palete); printf("Done with CPU output\n"); printf("CPU execution time %f \n", cutGetTimerValue(time_CPU)); printf("Allocating %d bytes for image \n", dib.image_size); cutilSafeCall( cudaMalloc( (void **)&d_In, dib.image_size*sizeof(unsigned char)) ); cutilSafeCall( cudaMalloc( (void **)&d_Out, dib.image_size*sizeof(unsigned char)) ); // creating space for filter matrix cutilSafeCall( cudaMalloc( (void **)&sobel_matrix, 25*sizeof(float)) ); cutilCheckError(cutStartTimer(time_mem)); cudaMemcpy(d_In, data, dib.image_size*sizeof(unsigned char), cudaMemcpyHostToDevice); cudaMemcpy(sobel_matrix, s_SobelMatrix, 25*sizeof(float), cudaMemcpyHostToDevice); cutilCheckError(cutStopTimer(time_mem)); FilterWrapper(data, dib.width, dib.height); // Copy image back to host cutilCheckError(cutStartTimer(time_mem)); cudaMemcpy(out, d_Out, dib.image_size*sizeof(unsigned char), cudaMemcpyDeviceToHost); cutilCheckError(cutStopTimer(time_mem)); printf("GPU execution time %f Memtime %f \n", cutGetTimerValue(time_GPU), cutGetTimerValue(time_mem)); printf("Total GPU = %f \n", (cutGetTimerValue(time_GPU) + cutGetTimerValue(time_mem))); // Write output image BitMapWrite(BMPOutFile, &bmp, &dib, out, palete); Cleanup(); }
int main(int argc, char **argv) { GpuProfiling::initProf(); // Start logs shrSetLogFileName ("scan.txt"); shrLog("%s Starting...\n\n", argv[0]); //Use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) cutilDeviceInit(argc, argv); else cudaSetDevice( cutGetMaxGflopsDeviceId() ); uint *d_Input, *d_Output; uint *h_Input, *h_OutputCPU, *h_OutputGPU; uint hTimer; const uint N = 13 * 1048576 / 2; shrLog("Allocating and initializing host arrays...\n"); cutCreateTimer(&hTimer); h_Input = (uint *)malloc(N * sizeof(uint)); h_OutputCPU = (uint *)malloc(N * sizeof(uint)); h_OutputGPU = (uint *)malloc(N * sizeof(uint)); srand(2009); for(uint i = 0; i < N; i++) h_Input[i] = rand(); shrLog("Allocating and initializing CUDA arrays...\n"); cutilSafeCall( cudaMalloc((void **)&d_Input, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_Output, N * sizeof(uint)) ); cutilSafeCall( cudaMemcpy(d_Input, h_Input, N * sizeof(uint), cudaMemcpyHostToDevice) ); shrLog("Initializing CUDA-C scan...\n\n"); initScan(); int globalFlag = 1; size_t szWorkgroup; const int iCycles = 100; shrLog("*** Running GPU scan for short arrays (%d identical iterations)...\n\n", iCycles); for(uint arrayLength = MIN_SHORT_ARRAY_SIZE; arrayLength <= MAX_SHORT_ARRAY_SIZE; arrayLength <<= 1){ shrLog("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength); cutilSafeCall( cudaThreadSynchronize() ); cutResetTimer(hTimer); cutStartTimer(hTimer); for(int i = 0; i < iCycles; i++) { szWorkgroup = scanExclusiveShort(d_Output, d_Input, N / arrayLength, arrayLength); } cutilSafeCall( cudaThreadSynchronize()); cutStopTimer(hTimer); double timerValue = 1.0e-3 * cutGetTimerValue(hTimer) / iCycles; shrLog("Validating the results...\n"); shrLog("...reading back GPU results\n"); cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost) ); shrLog(" ...scanExclusiveHost()\n"); scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength); // Compare GPU results with CPU results and accumulate error for this test shrLog(" ...comparing the results\n"); int localFlag = 1; for(uint i = 0; i < N; i++) { if(h_OutputCPU[i] != h_OutputGPU[i]) { localFlag = 0; break; } } // Log message on individual test result, then accumulate to global flag shrLog(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!"); globalFlag = globalFlag && localFlag; // Data log if (arrayLength == MAX_SHORT_ARRAY_SIZE) { shrLog("\n"); shrLogEx(LOGBOTH | MASTER, 0, "scan-Short, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup); shrLog("\n"); } } shrLog("***Running GPU scan for large arrays (%u identical iterations)...\n\n", iCycles); for(uint arrayLength = MIN_LARGE_ARRAY_SIZE; arrayLength <= MAX_LARGE_ARRAY_SIZE; arrayLength <<= 1){ shrLog("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength); cutilSafeCall( cudaThreadSynchronize() ); cutResetTimer(hTimer); cutStartTimer(hTimer); for(int i = 0; i < iCycles; i++) { szWorkgroup = scanExclusiveLarge(d_Output, d_Input, N / arrayLength, arrayLength); } cutilSafeCall( cudaThreadSynchronize() ); cutStopTimer(hTimer); double timerValue = 1.0e-3 * cutGetTimerValue(hTimer) / iCycles; shrLog("Validating the results...\n"); shrLog("...reading back GPU results\n"); cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost) ); shrLog("...scanExclusiveHost()\n"); scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength); // Compare GPU results with CPU results and accumulate error for this test shrLog(" ...comparing the results\n"); int localFlag = 1; for(uint i = 0; i < N; i++) { if(h_OutputCPU[i] != h_OutputGPU[i]) { localFlag = 0; break; } } // Log message on individual test result, then accumulate to global flag shrLog(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!"); globalFlag = globalFlag && localFlag; // Data log if (arrayLength == MAX_LARGE_ARRAY_SIZE) { shrLog("\n"); shrLogEx(LOGBOTH | MASTER, 0, "scan-Large, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup); shrLog("\n"); } } // pass or fail (cumulative... all tests in the loop) shrLog(globalFlag ? "PASSED\n\n" : "FAILED\n\n"); GpuProfiling::printResults(); shrLog("Shutting down...\n"); closeScan(); cutilSafeCall( cudaFree(d_Output)); cutilSafeCall( cudaFree(d_Input)); cutilCheckError( cutDeleteTimer(hTimer) ); cudaThreadExit(); exit(0); shrEXIT(argc, (const char**)argv); }
// display results using OpenGL void display() { cutilCheckError(cutStartTimer(timer)); // execute filter, writing results to pbo unsigned int *d_result; //DEPRECATED: cutilSafeCall( cudaGLMapBufferObject((void**)&d_result, pbo) ); cutilSafeCall(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); size_t num_bytes; cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void **)&d_result, &num_bytes, cuda_pbo_resource)); boxFilterRGBA(d_img, d_temp, d_result, width, height, filter_radius, iterations, nthreads); // DEPRECATED: cutilSafeCall(cudaGLUnmapBufferObject(pbo)); cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); if (g_bFBODisplay) { g_FrameBufferObject->bindRenderPath(); } // Common display code path { glClear(GL_COLOR_BUFFER_BIT); // load texture from pbo glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); glBindTexture(GL_TEXTURE_2D, texid); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0); glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0); // fragment program is required to display floating point texture glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader); glEnable(GL_FRAGMENT_PROGRAM_ARB); glDisable(GL_DEPTH_TEST); glBegin(GL_QUADS); if (GL_TEXTURE_TYPE == GL_TEXTURE_2D) { glTexCoord2f(0.0f, 0.0f); glVertex2f(0.0f, 0.0f); glTexCoord2f(1.0f, 0.0f); glVertex2f(1.0f, 0.0f); glTexCoord2f(1.0f, 1.0f); glVertex2f(1.0f, 1.0f); glTexCoord2f(0.0f, 1.0f); glVertex2f(0.0f, 1.0f); } else { glTexCoord2f(0.0f, 0.0f); glVertex2f(0.0f, 0.0f); glTexCoord2f((float)width, 0.0f); glVertex2f(1.0f, 0.0f); glTexCoord2f((float)width, (float)height); glVertex2f(1.0f, 1.0f); glTexCoord2f(0.0f, (float)height); glVertex2f(0.0f, 1.0f); } glEnd(); glBindTexture(GL_TEXTURE_TYPE, 0); glDisable(GL_FRAGMENT_PROGRAM_ARB); } if (g_bFBODisplay) { g_FrameBufferObject->unbindRenderPath(); } if (g_CheckRender && g_CheckRender->IsQAReadback() && g_Verify) { // readback for QA testing if (g_bFBODisplay) { shrLog("> (Frame %d) Readback FBO\n", frameCount); g_CheckRender->readback( width, height, g_FrameBufferObject->getFbo() ); } else { shrLog("> (Frame %d) Readback BackBuffer\n", frameCount); g_CheckRender->readback( width, height ); } g_CheckRender->savePPM(sOriginal[g_Index], true, NULL); if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, 0.15f)) { g_TotalErrors++; } g_Verify = false; } glutSwapBuffers(); glutReportErrors(); cutilCheckError(cutStopTimer(timer)); computeFPS(); }
int main(int argc, char **argv) { // Start logs shrSetLogFileName ("quasirandomGenerator.txt"); shrLog("%s Starting...\n\n", argv[0]); unsigned int useDoublePrecision; char *precisionChoice; cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice); if(precisionChoice == NULL) useDoublePrecision = 0; else{ if(!strcasecmp(precisionChoice, "double")) useDoublePrecision = 1; else useDoublePrecision = 0; } unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION]; float *h_OutputGPU; float *d_Output; int dim, pos; double delta, ref, sumDelta, sumRef, L1norm, gpuTime; unsigned int hTimer; if(sizeof(INT64) != 8){ shrLog("sizeof(INT64) != 8\n"); return 0; } // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) cutilDeviceInit(argc, argv); else cudaSetDevice( cutGetMaxGflopsDeviceId() ); cutilCheckError(cutCreateTimer(&hTimer)); int deviceIndex; cutilSafeCall(cudaGetDevice(&deviceIndex)); cudaDeviceProp deviceProp; cutilSafeCall(cudaGetDeviceProperties(&deviceProp, deviceIndex)); int version = deviceProp.major * 10 + deviceProp.minor; if(useDoublePrecision && version < 13){ shrLog("Double precision not supported.\n"); cudaThreadExit(); return 0; } shrLog("Allocating GPU memory...\n"); cutilSafeCall( cudaMalloc((void **)&d_Output, QRNG_DIMENSIONS * N * sizeof(float)) ); shrLog("Allocating CPU memory...\n"); h_OutputGPU = (float *)malloc(QRNG_DIMENSIONS * N * sizeof(float)); shrLog("Initializing QRNG tables...\n\n"); initQuasirandomGenerator(tableCPU); if(useDoublePrecision) initTable_SM13(tableCPU); else initTable_SM10(tableCPU); shrLog("Testing QRNG...\n\n"); cutilSafeCall( cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)) ); int numIterations = 20; for (int i = -1; i < numIterations; i++) { if (i == 0) { cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } if(useDoublePrecision) quasirandomGenerator_SM13(d_Output, 0, N); else quasirandomGenerator_SM10(d_Output, 0, N); } cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError(cutStopTimer(hTimer)); gpuTime = cutGetTimerValue(hTimer)/(double)numIterations*1e-3; shrLogEx(LOGBOTH | MASTER, 0, "quasirandomGenerator, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", (double)QRNG_DIMENSIONS * (double)N * 1.0E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128*QRNG_DIMENSIONS); shrLog("\nReading GPU results...\n"); cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost) ); shrLog("Comparing to the CPU results...\n\n"); sumDelta = 0; sumRef = 0; for(dim = 0; dim < QRNG_DIMENSIONS; dim++) for(pos = 0; pos < N; pos++){ ref = getQuasirandomValue63(pos, dim); delta = (double)h_OutputGPU[dim * N + pos] - ref; sumDelta += fabs(delta); sumRef += fabs(ref); } shrLog("L1 norm: %E\n", sumDelta / sumRef); shrLog("\nTesting inverseCNDgpu()...\n\n"); cutilSafeCall( cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)) ); for (int i = -1; i < numIterations; i++) { if (i == 0) { cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } if(useDoublePrecision) inverseCND_SM13(d_Output, NULL, QRNG_DIMENSIONS * N); else inverseCND_SM10(d_Output, NULL, QRNG_DIMENSIONS * N); } cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError(cutStopTimer(hTimer)); gpuTime = cutGetTimerValue(hTimer)/(double)numIterations*1e-3; shrLogEx(LOGBOTH | MASTER, 0, "quasirandomGenerator-inverse, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", (double)QRNG_DIMENSIONS * (double)N * 1E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128); shrLog("Reading GPU results...\n"); cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost) ); shrLog("\nComparing to the CPU results...\n"); sumDelta = 0; sumRef = 0; for(pos = 0; pos < QRNG_DIMENSIONS * N; pos++){ double p = (double)(pos + 1) / (double)(QRNG_DIMENSIONS * N + 1); ref = MoroInvCNDcpu(p); delta = (double)h_OutputGPU[pos] - ref; sumDelta += fabs(delta); sumRef += fabs(ref); } shrLog("L1 norm: %E\n\n", L1norm = sumDelta / sumRef); shrLog((L1norm < 1E-6) ? "PASSED\n\n" : "FAILED\n\n"); shrLog("Shutting down...\n"); cutilCheckError(cutDeleteTimer(hTimer)); free(h_OutputGPU); cutilSafeCall( cudaFree(d_Output) ); cudaThreadExit(); shrEXIT(argc, (const char**)argv); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { //start logs shrSetLogFileName ("volumeRender.txt"); shrLog("%s Starting...\n\n", argv[0]); if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { g_bQAReadback = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bQAGLVerify = true; fpsLimit = frameCheckNumber; } if (g_bQAReadback) { // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilDeviceInit(argc, argv); } else { cudaSetDevice( cutGetMaxGflopsDeviceId() ); } } else { // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL( &argc, argv ); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilGLDeviceInit(argc, argv); } else { cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); } /* int device; struct cudaDeviceProp prop; cudaGetDevice( &device ); cudaGetDeviceProperties( &prop, device ); if( !strncmp( "Tesla", prop.name, 5 ) ) { shrLog("This sample needs a card capable of OpenGL and display.\n"); shrLog("Please choose a different device with the -device=x argument.\n"); cutilExit(argc, argv); } */ } // parse arguments char *filename; if (cutGetCmdLineArgumentstr( argc, (const char**) argv, "file", &filename)) { volumeFilename = filename; } int n; if (cutGetCmdLineArgumenti( argc, (const char**) argv, "size", &n)) { volumeSize.width = volumeSize.height = volumeSize.depth = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "xsize", &n)) { volumeSize.width = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "ysize", &n)) { volumeSize.height = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "zsize", &n)) { volumeSize.depth = n; } // load volume data char* path = shrFindFilePath(volumeFilename, argv[0]); if (path == 0) { shrLog("Error finding file '%s'\n", volumeFilename); exit(EXIT_FAILURE); } size_t size = volumeSize.width*volumeSize.height*volumeSize.depth*sizeof(VolumeType); void *h_volume = loadRawFile(path, size); initCuda(h_volume, volumeSize); free(h_volume); cutilCheckError( cutCreateTimer( &timer)); shrLog("Press '=' and '-' to change density\n" " ']' and '[' to change brightness\n" " ';' and ''' to modify transfer function offset\n" " '.' and ',' to modify transfer function scale\n\n"); // calculate new grid size gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y)); if (g_bQAReadback) { g_CheckRender = new CheckBackBuffer(width, height, 4, false); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); uint *d_output; cutilSafeCall(cudaMalloc((void**)&d_output, width*height*sizeof(uint))); cutilSafeCall(cudaMemset(d_output, 0, width*height*sizeof(uint))); float modelView[16] = { 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 4.0f, 1.0f }; invViewMatrix[0] = modelView[0]; invViewMatrix[1] = modelView[4]; invViewMatrix[2] = modelView[8]; invViewMatrix[3] = modelView[12]; invViewMatrix[4] = modelView[1]; invViewMatrix[5] = modelView[5]; invViewMatrix[6] = modelView[9]; invViewMatrix[7] = modelView[13]; invViewMatrix[8] = modelView[2]; invViewMatrix[9] = modelView[6]; invViewMatrix[10] = modelView[10]; invViewMatrix[11] = modelView[14]; // call CUDA kernel, writing results to PBO copyInvViewMatrix(invViewMatrix, sizeof(float4)*3); // Start timer 0 and process n loops on the GPU int nIter = 10; for (int i = -1; i < nIter; i++) { if( i == 0 ) { cudaThreadSynchronize(); cutStartTimer(timer); } render_kernel(gridSize, blockSize, d_output, width, height, density, brightness, transferOffset, transferScale); } cudaThreadSynchronize(); cutStopTimer(timer); // Get elapsed time and throughput, then log to sample and master logs double dAvgTime = cutGetTimerValue(timer)/(nIter * 1000.0); shrLogEx(LOGBOTH | MASTER, 0, "volumeRender, Throughput = %.4f MTexels/s, Time = %.5f s, Size = %u Texels, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * width * height)/dAvgTime, dAvgTime, (width * height), 1, blockSize.x * blockSize.y); cutilCheckMsg("Error: render_kernel() execution FAILED"); cutilSafeCall( cudaThreadSynchronize() ); cutilSafeCall( cudaMemcpy(g_CheckRender->imageData(), d_output, width*height*4, cudaMemcpyDeviceToHost) ); g_CheckRender->savePPM(sOriginal[g_Index], true, NULL); if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, THRESHOLD)) { shrLog("\nFAILED\n\n"); } else { shrLog("\nPASSED\n\n"); } cudaFree(d_output); freeCudaBuffers(); if (g_CheckRender) { delete g_CheckRender; g_CheckRender = NULL; } } else { // This is the normal rendering path for VolumeRender glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutMouseFunc(mouse); glutMotionFunc(motion); glutReshapeFunc(reshape); glutIdleFunc(idle); initPixelBuffer(); if (g_bQAGLVerify) { g_CheckRender = new CheckBackBuffer(width, height, 4); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } atexit(cleanup); glutMainLoop(); } cudaThreadExit(); shrEXIT(argc, (const char**)argv); }
void quickshift(image_t im, float sigma, float tau, float * map, float * gaps, float * E) { int verb = 1 ; float *M = 0, *n = 0; float tau2; int K, d; int N1,N2, i1,i2, j1,j2, R, tR; int medoid = 0 ; float const * I = im.I; N1 = im.N1; N2 = im.N2; K = im.K; d = 2 + K ; /* Total dimensions include spatial component (x,y) */ tau2 = tau*tau; if (medoid) { /* n and M are only used in mediod shift */ M = (float *) calloc(N1*N2*d, sizeof(float)) ; n = (float *) calloc(N1*N2, sizeof(float)) ; } R = (int) ceil (3 * sigma) ; tR = (int) ceil (tau) ; if (verb) { printf("quickshift: [N1,N2,K]: [%d,%d,%d]\n", N1,N2,K) ; printf("quickshift: type: %s\n", medoid ? "medoid" : "quick"); printf("quickshift: sigma: %g\n", sigma) ; /* R is ceil(3 * sigma) and determines the window size to accumulate * similarity */ printf("quickshift: R: %d\n", R) ; printf("quickshift: tau: %g\n", tau) ; printf("quickshift: tR: %d\n", tR) ; } /* ----------------------------------------------------------------- * n * -------------------------------------------------------------- */ /* If we are doing medoid shift, initialize n to the inner product of the * image with itself */ if (n) { for (i2 = 0 ; i2 < N2 ; ++ i2) { for (i1 = 0 ; i1 < N1 ; ++ i1) { n [i1 + N1 * i2] = inner(I,N1,N2,K, i1,i2, i1,i2) ; } } } unsigned int Etimer; cutilCheckError( cutCreateTimer(&Etimer) ); cutilCheckError( cutResetTimer(Etimer) ); cutilCheckError( cutStartTimer(Etimer) ); /* ----------------------------------------------------------------- * E = - [oN'*F]', M * -------------------------------------------------------------- */ /* D_ij = d(x_i,x_j) E_ij = exp(- .5 * D_ij / sigma^2) ; F_ij = - E_ij E_i = sum_j E_ij M_di = sum_j X_j F_ij E is the parzen window estimate of the density 0 = dissimilar to everything, windowsize = identical */ for (i2 = 0 ; i2 < N2 ; ++ i2) { for (i1 = 0 ; i1 < N1 ; ++ i1) { float Ei = 0; int j1min = VL_MAX(i1 - R, 0 ) ; int j1max = VL_MIN(i1 + R, N1-1) ; int j2min = VL_MAX(i2 - R, 0 ) ; int j2max = VL_MIN(i2 + R, N2-1) ; /* For each pixel in the window compute the distance between it and the * source pixel */ for (j2 = j2min ; j2 <= j2max ; ++ j2) { for (j1 = j1min ; j1 <= j1max ; ++ j1) { float Dij = distance(I,N1,N2,K, i1,i2, j1,j2) ; /* Make distance a similarity */ float Fij = exp(- Dij / (2*sigma*sigma)) ; /* E is E_i above */ Ei += Fij; if (M) { /* Accumulate votes for the median */ int k ; M [i1 + N1*i2 + (N1*N2) * 0] += j1 * Fij ; M [i1 + N1*i2 + (N1*N2) * 1] += j2 * Fij ; for (k = 0 ; k < K ; ++k) { M [i1 + N1*i2 + (N1*N2) * (k+2)] += I [j1 + N1*j2 + (N1*N2) * k] * Fij ; } } } /* j1 */ } /* j2 */ /* Normalize */ E [i1 + N1 * i2] = Ei / ((j1max-j1min)*(j2max-j2min)); /*E [i1 + N1 * i2] = Ei ; */ } /* i1 */ } /* i2 */ cutilCheckError( cutStopTimer(Etimer) ); float ETime = cutGetTimerValue(Etimer); printf("ComputeE: %fms\n", ETime); unsigned int Ntimer; cutilCheckError( cutCreateTimer(&Ntimer) ); cutilCheckError( cutResetTimer(Ntimer) ); cutilCheckError( cutStartTimer(Ntimer) ); /* ----------------------------------------------------------------- * Find best neighbors * -------------------------------------------------------------- */ if (medoid) { /* Qij = - nj Ei - 2 sum_k Gjk Mik n is I.^2 */ /* medoid shift */ for (i2 = 0 ; i2 < N2 ; ++i2) { for (i1 = 0 ; i1 < N1 ; ++i1) { float sc_best = 0 ; /* j1/j2 best are the best indicies for each i */ float j1_best = i1 ; float j2_best = i2 ; int j1min = VL_MAX(i1 - R, 0 ) ; int j1max = VL_MIN(i1 + R, N1-1) ; int j2min = VL_MAX(i2 - R, 0 ) ; int j2max = VL_MIN(i2 + R, N2-1) ; for (j2 = j2min ; j2 <= j2max ; ++ j2) { for (j1 = j1min ; j1 <= j1max ; ++ j1) { float Qij = - n [j1 + j2 * N1] * E [i1 + i2 * N1] ; int k ; Qij -= 2 * j1 * M [i1 + i2 * N1 + (N1*N2) * 0] ; Qij -= 2 * j2 * M [i1 + i2 * N1 + (N1*N2) * 1] ; for (k = 0 ; k < K ; ++k) { Qij -= 2 * I [j1 + j2 * N1 + (N1*N2) * k] * M [i1 + i2 * N1 + (N1*N2) * (k + 2)] ; } if (Qij > sc_best) { sc_best = Qij ; j1_best = j1 ; j2_best = j2 ; } } } /* map_i is the linear index of j which is the best pair (in matlab * notation * gaps_i is the score of the best match */ map [i1 + N1 * i2] = j1_best + N1 * j2_best ; /*+ 1 ; */ gaps[i1 + N1 * i2] = sc_best ; } } } else { /* Quickshift assigns each i to the closest j which has an increase in the * density (E). If there is no j s.t. Ej > Ei, then gaps_i == inf (a root * node in one of the trees of merges). */ for (i2 = 0 ; i2 < N2 ; ++i2) { for (i1 = 0 ; i1 < N1 ; ++i1) { float E0 = E [i1 + N1 * i2] ; float d_best = INF ; float j1_best = i1 ; float j2_best = i2 ; int j1min = VL_MAX(i1 - tR, 0 ) ; int j1max = VL_MIN(i1 + tR, N1-1) ; int j2min = VL_MAX(i2 - tR, 0 ) ; int j2max = VL_MIN(i2 + tR, N2-1) ; for (j2 = j2min ; j2 <= j2max ; ++ j2) { for (j1 = j1min ; j1 <= j1max ; ++ j1) { if (E [j1 + N1 * j2] > E0) { float Dij = distance(I,N1,N2,K, i1,i2, j1,j2) ; if (Dij <= tau2 && Dij < d_best) { d_best = Dij ; j1_best = j1 ; j2_best = j2 ; } } } } /* map is the index of the best pair */ /* gaps_i is the minimal distance, inf implies no Ej > Ei within * distance tau from the point */ map [i1 + N1 * i2] = j1_best + N1 * j2_best ; /* + 1 ; */ if (map[i1 + N1 * i2] != i1 + N1 * i2) gaps[i1 + N1 * i2] = sqrt(d_best) ; else gaps[i1 + N1 * i2] = d_best; /* inf */ } } } if (M) free(M) ; if (n) free(n) ; cutilCheckError( cutStopTimer(Ntimer) ); float NTime = cutGetTimerValue(Ntimer); printf("ComputeN: %fms\n", NTime); }
void displayFunc(void){ cutStartTimer(hTimer); TColor *d_dst = NULL; size_t num_bytes; if(frameCounter++ == 0) cutResetTimer(hTimer); // DEPRECATED: cutilSafeCall(cudaGLMapBufferObject((void**)&d_dst, gl_PBO)); cutilSafeCall(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); cutilCheckMsg("cudaGraphicsMapResources failed"); cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void**)&d_dst, &num_bytes, cuda_pbo_resource)); cutilCheckMsg("cudaGraphicsResourceGetMappedPointer failed"); cutilSafeCall( CUDA_Bind2TextureArray() ); runImageFilters(d_dst); cutilSafeCall( CUDA_UnbindTexture() ); // DEPRECATED: cutilSafeCall(cudaGLUnmapBufferObject(gl_PBO)); cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); if (g_bFBODisplay) { g_FrameBufferObject->bindRenderPath(); } // Common display code path { glClear(GL_COLOR_BUFFER_BIT); glTexSubImage2D( GL_TEXTURE_2D, 0, 0, 0, imageW, imageH, GL_RGBA, GL_UNSIGNED_BYTE, BUFFER_DATA(0) ); glBegin(GL_TRIANGLES); glTexCoord2f(0, 0); glVertex2f(-1, -1); glTexCoord2f(2, 0); glVertex2f(+3, -1); glTexCoord2f(0, 2); glVertex2f(-1, +3); glEnd(); glFinish(); } if (g_bFBODisplay) { g_FrameBufferObject->unbindRenderPath(); glBindTexture(GL_TEXTURE_2D, 0); } if (g_CheckRender && g_CheckRender->IsQAReadback() && g_Verify) { printf("> (Frame %d) readback BackBuffer\n", frameCount); if (g_bFBODisplay) { g_CheckRender->readback( imageW, imageH, g_FrameBufferObject->getFbo() ); } else { g_CheckRender->readback( imageW, imageH ); } g_CheckRender->savePPM ( sOriginal[g_Kernel], true, NULL ); if (!g_CheckRender->PPMvsPPM(sOriginal[g_Kernel], sReference[g_Kernel], MAX_EPSILON_ERROR, 0.15f)) { g_TotalErrors++; } g_Verify = false; } if(frameCounter == frameN){ frameCounter = 0; if(g_FPS){ printf("FPS: %3.1f\n", frameN / (cutGetTimerValue(hTimer) * 0.001) ); g_FPS = false; } } glutSwapBuffers(); cutStopTimer(hTimer); computeFPS(); }
int main(int argc, char **argv) { char *precisionChoice; cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice); if(precisionChoice == NULL) useDoublePrecision = 0; else { if(!strcasecmp(precisionChoice, "double")) useDoublePrecision = 1; else useDoublePrecision = 0; } const int MAX_GPU_COUNT = 8; const int OPT_N = 256; const int PATH_N = 1 << 18; const unsigned int SEED = 777; //Input data array TOptionData optionData[OPT_N]; //Final GPU MC results TOptionValue callValueGPU[OPT_N]; //"Theoretical" call values by Black-Scholes formula float callValueBS[OPT_N]; //Solver config TOptionPlan optionSolver[MAX_GPU_COUNT]; //OS thread ID CUTThread threadID[MAX_GPU_COUNT]; //GPU number present in the system int GPU_N; int gpuBase, gpuIndex; int i; //Timer unsigned int hTimer; float time; double delta, ref, sumDelta, sumRef, sumReserve; cutilSafeCall( cudaGetDeviceCount(&GPU_N) ); cutilCheckError( cutCreateTimer(&hTimer) ); #ifdef _EMU GPU_N = 1; #endif printf("main(): generating input data...\n"); srand(123); for(i = 0; i < OPT_N; i++) { optionData[i].S = randFloat(5.0f, 50.0f); optionData[i].X = randFloat(10.0f, 25.0f); optionData[i].T = randFloat(1.0f, 5.0f); optionData[i].R = 0.06f; optionData[i].V = 0.10f; callValueGPU[i].Expected = -1.0f; callValueGPU[i].Confidence = -1.0f; } printf("main(): starting %i host threads...\n", GPU_N); //Get option count for each GPU for(i = 0; i < GPU_N; i++) optionSolver[i].optionCount = OPT_N / GPU_N; //Take into account cases with "odd" option counts for(i = 0; i < (OPT_N % GPU_N); i++) optionSolver[i].optionCount++; //Assign GPU option ranges gpuBase = 0; for(i = 0; i < GPU_N; i++) { optionSolver[i].device = i; optionSolver[i].optionData = optionData + gpuBase; optionSolver[i].callValue = callValueGPU + gpuBase; optionSolver[i].seed = SEED; optionSolver[i].pathN = PATH_N; gpuBase += optionSolver[i].optionCount; } //Start the timer cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); //Start CPU thread for each GPU for(gpuIndex = 0; gpuIndex < GPU_N; gpuIndex++) threadID[gpuIndex] = cutStartThread((CUT_THREADROUTINE)solverThread, &optionSolver[gpuIndex]); //Stop the timer cutilCheckError( cutStopTimer(hTimer) ); time = cutGetTimerValue(hTimer); printf("main(): waiting for GPU results...\n"); cutWaitForThreads(threadID, GPU_N); printf("main(): GPU statistics\n"); for(i = 0; i < GPU_N; i++) { printf("GPU #%i\n", optionSolver[i].device); printf("Options : %i\n", optionSolver[i].optionCount); printf("Simulation paths: %i\n", optionSolver[i].pathN); } printf("\nTotal time (ms.): %f\n", time); printf("Options per sec.: %f\n", OPT_N / (time * 0.001)); #ifdef DO_CPU printf("main(): running CPU MonteCarlo...\n"); TOptionValue callValueCPU; sumDelta = 0; sumRef = 0; for(i = 0; i < OPT_N; i++) { MonteCarloCPU( callValueCPU, optionData[i], NULL, PATH_N ); delta = fabs(callValueCPU.Expected - callValueGPU[i].Expected); ref = callValueCPU.Expected; sumDelta += delta; sumRef += fabs(ref); printf("Exp : %f | %f\t", callValueCPU.Expected, callValueGPU[i].Expected); printf("Conf: %f | %f\n", callValueCPU.Confidence, callValueGPU[i].Confidence); } printf("L1 norm: %E\n", sumDelta / sumRef); #endif printf("main(): comparing Monte Carlo and Black-Scholes results...\n"); sumDelta = 0; sumRef = 0; sumReserve = 0; for(i = 0; i < OPT_N; i++) { BlackScholesCall( callValueBS[i], optionData[i] ); delta = fabs(callValueBS[i] - callValueGPU[i].Expected); ref = callValueBS[i]; sumDelta += delta; sumRef += fabs(ref); if(delta > 1e-6) sumReserve += callValueGPU[i].Confidence / delta; #ifdef PRINT_RESULTS printf("BS: %f; delta: %E\n", callValueBS[i], delta); #endif } sumReserve /= OPT_N; printf("L1 norm : %E\n", sumDelta / sumRef); printf("Average reserve: %f\n", sumReserve); printf((sumReserve > 1.0f) ? "PASSED\n" : "FAILED.\n"); printf("Shutting down...\n"); cutilCheckError( cutDeleteTimer(hTimer) ); cutilExit(argc, argv); }
void display() { cutilCheckError(cutStartTimer(timer)); // update the simulation if (!bPause) { psystem->setIterations(iterations); psystem->setDamping(damping); psystem->setGravity(-gravity); psystem->setCollideSpring(collideSpring); psystem->setCollideDamping(collideDamping); psystem->setCollideShear(collideShear); psystem->setCollideAttraction(collideAttraction); psystem->update(timestep); renderer->setVertexBuffer(psystem->getCurrentReadBuffer(), psystem->getNumParticles()); } else { usleep(32666); } // render glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); // view transform glMatrixMode(GL_MODELVIEW); glLoadIdentity(); for (int c = 0; c < 3; ++c) { camera_trans_lag[c] += (camera_trans[c] - camera_trans_lag[c]) * inertia; camera_rot_lag[c] += (camera_rot[c] - camera_rot_lag[c]) * inertia; } glTranslatef(camera_trans_lag[0], camera_trans_lag[1], camera_trans_lag[2]); glRotatef(camera_rot_lag[0], 1.0, 0.0, 0.0); glRotatef(camera_rot_lag[1], 0.0, 1.0, 0.0); glGetFloatv(GL_MODELVIEW_MATRIX, modelView); // cube glColor3f(1.0, 1.0, 1.0); glutWireCube(2.0); // collider glPushMatrix(); float4 p = psystem->getColliderPos(); glTranslatef(p.x, p.y, p.z); glColor3f(1.0, 0.0, 0.0); glutSolidSphere(psystem->getColliderRadius(), 20, 10); glPopMatrix(); if (displayEnabled) { renderer->display(displayMode); } if (displaySliders) { glDisable(GL_DEPTH_TEST); glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color glEnable(GL_BLEND); params->Render(0, 0); glDisable(GL_BLEND); glEnable(GL_DEPTH_TEST); } cutilCheckError(cutStopTimer(timer)); glutSwapBuffers(); fpsCount++; // this displays the frame rate updated every second (independent of frame rate) if (fpsCount >= fpsLimit) { char fps[256]; float ifps = 1.f / (cutGetAverageTimerValue(timer) / 1000.f); sprintf(fps, "CUDA particles (%d particles): %3.1f fps", numParticles, ifps); glutSetWindowTitle(fps); fpsCount = 0; fpsLimit = (ifps > 1.f) ? (int)ifps : 1; if (bPause) fpsLimit = 0; cutilCheckError(cutResetTimer(timer)); } glutReportErrors(); }
//////////////////////////////////////////////////////////////////////////////// //! Display callback //////////////////////////////////////////////////////////////////////////////// void display() { cutilCheckError(cutStartTimer(timer)); // run CUDA kernel to generate geometry if (compute) { computeIsosurface(); } if (g_bFBODisplay) { g_FrameBufferObject->bindRenderPath(); } // Common display code path { glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); // set view matrix glMatrixMode(GL_MODELVIEW); glLoadIdentity(); glTranslatef(translate.x, translate.y, translate.z); glRotatef(rotate.x, 1.0, 0.0, 0.0); glRotatef(rotate.y, 0.0, 1.0, 0.0); glPolygonMode(GL_FRONT_AND_BACK, wireframe? GL_LINE : GL_FILL); if (lighting) { glEnable(GL_LIGHTING); } // render if (render) { glPushMatrix(); glRotatef(180.0, 0.0, 1.0, 0.0); glRotatef(90.0, 1.0, 0.0, 0.0); renderIsosurface(); glPopMatrix(); } glDisable(GL_LIGHTING); } if (g_bFBODisplay) { g_FrameBufferObject->unbindRenderPath(); // now rebind the texture and renderQuad // g_FrameBufferObject->renderQuad(width, height, GL_TEXTURE_TYPE); } if (g_CheckRender && g_CheckRender->IsQAReadback() && g_Verify) { // readback for QA testing if (g_bFBODisplay) { printf("> (Frame %d) Readback FBO\n", frameCount); g_CheckRender->readback( window_width, window_height, g_FrameBufferObject->getFbo() ); } else { printf("> (Frame %d) Readback BackBuffer\n", frameCount); g_CheckRender->readback( window_width, window_height ); } g_CheckRender->savePPM(sOriginal[g_Index], true, NULL); if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, 0.15f)) { g_TotalErrors++; } g_Verify = false; } glutSwapBuffers(); glutReportErrors(); cutilCheckError(cutStopTimer(timer)); computeFPS(); }
// main rendering loop void display() { cutilCheckError(cutStartTimer(timer)); if( !gestures.m_bPause ) { //Read next available data gestures.m_Context.WaitAndUpdateAll(); } //Process the data gestures.m_DepthGenerator.GetMetaData( depthMD ); gestures.m_UserGenerator.GetUserPixels( 0, sceneMD ); // move camera if (cameraPos[1] > 0.0f) cameraPos[1] = 0.0f; cameraPosLag += (cameraPos - cameraPosLag) * inertia; cameraRotLag += (cameraRot - cameraRotLag) * inertia; cursorPosLag += (cursorPos - cursorPosLag) * inertia; // view transform glMatrixMode(GL_MODELVIEW); glLoadIdentity(); glRotatef(cameraRotLag[0], 1.0, 0.0, 0.0); glRotatef(cameraRotLag[1], 0.0, 1.0, 0.0); glTranslatef(cameraPosLag[0], cameraPosLag[1], cameraPosLag[2]); glGetFloatv(GL_MODELVIEW_MATRIX, modelView); // update the simulation if (!paused) { if (emitterOn) { runEmitter(); } SimParams &p = psystem->getParams(); p.cursorPos = make_float3(cursorPosLag.x, cursorPosLag.y, cursorPosLag.z); psystem->step(timestep); currentTime += timestep; } renderer->calcVectors(); vec3f sortVector = renderer->getSortVector(); psystem->setSortVector(make_float3(sortVector.x, sortVector.y, sortVector.z)); psystem->setModelView(modelView); psystem->setSorting(sort); psystem->depthSort(); // render glClearColor(0.0, 0.0, 0.0, 1.0); glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); renderScene(); // draw particles if (displayEnabled) { // render scene to offscreen buffers to get correct occlusion renderer->beginSceneRender(SmokeRenderer::LIGHT_BUFFER); renderScene(); renderer->endSceneRender(SmokeRenderer::LIGHT_BUFFER); renderer->beginSceneRender(SmokeRenderer::SCENE_BUFFER); renderScene(); renderer->endSceneRender(SmokeRenderer::SCENE_BUFFER); renderer->setPositionBuffer(psystem->getPosBuffer()); renderer->setVelocityBuffer(psystem->getVelBuffer()); renderer->setIndexBuffer(psystem->getSortedIndexBuffer()); renderer->setNumParticles(psystem->getNumParticles()); renderer->setParticleRadius(spriteSize); renderer->setDisplayLightBuffer(displayLightBuffer); renderer->setAlpha(alpha); renderer->setShadowAlpha(shadowAlpha); renderer->setLightPosition(lightPos); renderer->setColorAttenuation(colorAttenuation); renderer->setLightColor(lightColor); renderer->setNumSlices(numSlices); renderer->setNumDisplayedSlices(numDisplayedSlices); renderer->setBlurRadius(blurRadius); renderer->render(); if (drawVectors) { renderer->debugVectors(); } } // display sliders if (displaySliders) { glDisable(GL_DEPTH_TEST); glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color glEnable(GL_BLEND); params->Render(0, 0); glDisable(GL_BLEND); glEnable(GL_DEPTH_TEST); } glutSwapBuffers(); glutReportErrors(); cutilCheckError(cutStopTimer(timer)); // readback for verification//sw/devrel/SDK10/Compute/projects/recursiveGaussian/recursiveGaussian.cpp if (g_CheckRender && g_CheckRender->IsQAReadback() && (++frameNumber >= frameCheckNumber)) { printf("> (Frame %d) Readback BackBuffer\n", frameNumber); g_CheckRender->readback( winWidth, winHeight ); g_CheckRender->savePPM(sOriginal, true, NULL); bool passed = g_CheckRender->PPMvsPPM(sOriginal, sReference, MAX_EPSILON_ERROR, THRESHOLD); printf("Summary: %d errors!\n", passed ? 0 : 1); printf("%s\n", passed ? "PASSED" : "FAILED"); cleanup(); exit(0); } fpsCount++; // this displays the frame rate updated every second (independent of frame rate) if (fpsCount >= fpsLimit) { char fps[256]; float ifps = 1.f / (cutGetAverageTimerValue(timer) / 1000.f); sprintf(fps, "CUDA Smoke Particles (%d particles): %3.1f fps", numParticles, ifps); glutSetWindowTitle(fps); fpsCount = 0; fpsLimit = (ifps > 1.f) ? (int)ifps : 1; if (paused) fpsLimit = 0; cutilCheckError(cutResetTimer(timer)); } }
// display results using OpenGL (called by GLUT) void display() { cutilCheckError(cutStartTimer(timer)); // use OpenGL to build view matrix GLfloat modelView[16]; glMatrixMode(GL_MODELVIEW); glPushMatrix(); glLoadIdentity(); glRotatef(-viewRotation.x, 1.0, 0.0, 0.0); glRotatef(-viewRotation.y, 0.0, 1.0, 0.0); glTranslatef(-viewTranslation.x, -viewTranslation.y, -viewTranslation.z); glGetFloatv(GL_MODELVIEW_MATRIX, modelView); glPopMatrix(); invViewMatrix[0] = modelView[0]; invViewMatrix[1] = modelView[4]; invViewMatrix[2] = modelView[8]; invViewMatrix[3] = modelView[12]; invViewMatrix[4] = modelView[1]; invViewMatrix[5] = modelView[5]; invViewMatrix[6] = modelView[9]; invViewMatrix[7] = modelView[13]; invViewMatrix[8] = modelView[2]; invViewMatrix[9] = modelView[6]; invViewMatrix[10] = modelView[10]; invViewMatrix[11] = modelView[14]; render(); // display results glClear(GL_COLOR_BUFFER_BIT); // draw image from PBO glDisable(GL_DEPTH_TEST); glPixelStorei(GL_UNPACK_ALIGNMENT, 1); #if 0 // draw using glDrawPixels (slower) glRasterPos2i(0, 0); glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); glDrawPixels(width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0); glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0); #else // draw using texture // copy from pbo to texture glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); glBindTexture(GL_TEXTURE_2D, tex); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0); glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0); // draw textured quad glEnable(GL_TEXTURE_2D); glBegin(GL_QUADS); glTexCoord2f(0, 0); glVertex2f(0, 0); glTexCoord2f(1, 0); glVertex2f(1, 0); glTexCoord2f(1, 1); glVertex2f(1, 1); glTexCoord2f(0, 1); glVertex2f(0, 1); glEnd(); glDisable(GL_TEXTURE_2D); glBindTexture(GL_TEXTURE_2D, 0); #endif if (g_CheckRender && g_CheckRender->IsQAReadback() && g_Verify) { // readback for QA testing shrLog("\n> (Frame %d) Readback BackBuffer\n", frameCount); g_CheckRender->readback( width, height ); g_CheckRender->savePPM(sOriginal[g_Index], true, NULL); if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, THRESHOLD)) { g_TotalErrors++; } g_Verify = false; } glutSwapBuffers(); glutReportErrors(); cutilCheckError(cutStopTimer(timer)); computeFPS(); }
//////////////////////////////////////////////////////////////////////////////// // Test driver //////////////////////////////////////////////////////////////////////////////// int main(int argc, char** argv){ uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal; uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal; uint hTimer; const uint N = 4 * 1048576; const uint DIR = 1; const uint numValues = 65536; printf("Allocating and initializing host arrays...\n\n"); cutCreateTimer(&hTimer); h_SrcKey = (uint *)malloc(N * sizeof(uint)); h_SrcVal = (uint *)malloc(N * sizeof(uint)); h_DstKey = (uint *)malloc(N * sizeof(uint)); h_DstVal = (uint *)malloc(N * sizeof(uint)); srand(2009); for(uint i = 0; i < N; i++) h_SrcKey[i] = rand() % numValues; fillValues(h_SrcVal, N); printf("Allocating and initializing CUDA arrays...\n\n"); cutilSafeCall( cudaMalloc((void **)&d_DstKey, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_DstVal, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_BufKey, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_BufVal, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)) ); cutilSafeCall( cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)) ); cutilSafeCall( cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice) ); cutilSafeCall( cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice) ); printf("Initializing GPU merge sort...\n"); initMergeSort(); printf("Running GPU merge sort...\n"); cutilSafeCall( cudaThreadSynchronize() ); cutResetTimer(hTimer); cutStartTimer(hTimer); mergeSort( d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, DIR ); cutilSafeCall( cudaThreadSynchronize() ); cutStopTimer(hTimer); printf("Time: %f ms\n", cutGetTimerValue(hTimer)); printf("Reading back GPU merge sort results...\n"); cutilSafeCall( cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost) ); cutilSafeCall( cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost) ); printf("Inspecting the results...\n"); uint keysFlag = validateSortedKeys( h_DstKey, h_SrcKey, 1, N, numValues, DIR ); uint valuesFlag = validateSortedValues( h_DstKey, h_DstVal, h_SrcKey, 1, N ); printf( (keysFlag && valuesFlag) ? "TEST PASSED\n" : "TEST FAILED\n"); printf("Shutting down...\n"); closeMergeSort(); cutilCheckError( cutDeleteTimer(hTimer) ); cutilSafeCall( cudaFree(d_SrcVal) ); cutilSafeCall( cudaFree(d_SrcKey) ); cutilSafeCall( cudaFree(d_BufVal) ); cutilSafeCall( cudaFree(d_BufKey) ); cutilSafeCall( cudaFree(d_DstVal) ); cutilSafeCall( cudaFree(d_DstKey) ); free(h_DstVal); free(h_DstKey); free(h_SrcVal); free(h_SrcKey); cudaThreadExit(); cutilExit(argc, argv); }
//////////////////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv){ const unsigned int OPT_N_MAX = 512; unsigned int useDoublePrecision; printf("[binomialOptions]\n"); int devID = cutilDeviceInit(argc, argv); if (devID < 0) { printf("exiting...\n"); cutilExit(argc, argv); exit(0); } cutilSafeCall(cudaGetDevice(&devID)); cudaDeviceProp deviceProp; cutilSafeCall(cudaGetDeviceProperties(&deviceProp, devID)); char *precisionChoice; cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice); if(precisionChoice == NULL) { useDoublePrecision = 0; } else { if(!strcasecmp(precisionChoice, "double")) useDoublePrecision = 1; else useDoublePrecision = 0; } printf(useDoublePrecision ? "Using double precision...\n" : "Using single precision...\n"); const int OPT_N = deviceEmulation() ? 1 : OPT_N_MAX; TOptionData optionData[OPT_N_MAX]; float callValueBS[OPT_N_MAX], callValueGPU[OPT_N_MAX], callValueCPU[OPT_N_MAX]; double sumDelta, sumRef, gpuTime, errorVal; unsigned int hTimer; int i; cutilCheckError( cutCreateTimer(&hTimer) ); int version = deviceProp.major * 10 + deviceProp.minor; if(useDoublePrecision && version < 13){ printf("Double precision is not supported.\n"); return 0; } printf("Generating input data...\n"); //Generate options set srand(123); for(i = 0; i < OPT_N; i++){ optionData[i].S = randData(5.0f, 30.0f); optionData[i].X = randData(1.0f, 100.0f); optionData[i].T = randData(0.25f, 10.0f); optionData[i].R = 0.06f; optionData[i].V = 0.10f; BlackScholesCall(callValueBS[i], optionData[i]); } printf("Running GPU binomial tree...\n"); cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); if(useDoublePrecision) binomialOptions_SM13(callValueGPU, optionData, OPT_N); else binomialOptions_SM10(callValueGPU, optionData, OPT_N); cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutStopTimer(hTimer) ); gpuTime = cutGetTimerValue(hTimer); printf("Options count : %i \n", OPT_N); printf("Time steps : %i \n", NUM_STEPS); printf("binomialOptionsGPU() time: %f msec\n", gpuTime); printf("Options per second : %f \n", OPT_N / (gpuTime * 0.001)); printf("Running CPU binomial tree...\n"); for(i = 0; i < OPT_N; i++) binomialOptionsCPU(callValueCPU[i], optionData[i]); printf("Comparing the results...\n"); sumDelta = 0; sumRef = 0; printf("GPU binomial vs. Black-Scholes\n"); for(i = 0; i < OPT_N; i++){ sumDelta += fabs(callValueBS[i] - callValueGPU[i]); sumRef += fabs(callValueBS[i]); } if(sumRef >1E-5) printf("L1 norm: %E\n", sumDelta / sumRef); else printf("Avg. diff: %E\n", sumDelta / (double)OPT_N); printf("CPU binomial vs. Black-Scholes\n"); sumDelta = 0; sumRef = 0; for(i = 0; i < OPT_N; i++){ sumDelta += fabs(callValueBS[i]- callValueCPU[i]); sumRef += fabs(callValueBS[i]); } if(sumRef >1E-5) printf("L1 norm: %E\n", sumDelta / sumRef); else printf("Avg. diff: %E\n", sumDelta / (double)OPT_N); printf("CPU binomial vs. GPU binomial\n"); sumDelta = 0; sumRef = 0; for(i = 0; i < OPT_N; i++){ sumDelta += fabs(callValueGPU[i] - callValueCPU[i]); sumRef += callValueCPU[i]; } if(sumRef > 1E-5) printf("L1 norm: %E\n", errorVal = sumDelta / sumRef); else printf("Avg. diff: %E\n", errorVal = sumDelta / (double)OPT_N); printf("Shutting down...\n"); printf("\n[binomialOptions] - Test Summary:\n"); printf((errorVal < 5e-4) ? "PASSED\n" : "FAILED\n"); cutilCheckError( cutDeleteTimer(hTimer) ); cudaThreadExit(); cutilExit(argc, argv); }
T benchmarkReduceMax(int n, int numThreads, int numBlocks, int maxThreads, int maxBlocks, int whichKernel, int testIterations, bool cpuFinalReduction, int cpuFinalThreshold, unsigned int timer, T* h_odata, T* d_idata, T* d_odata) { T gpu_result = 0; bool needReadBack = true; for (int i = 0; i < testIterations; ++i) { gpu_result = 0; cutilDeviceSynchronize(); cutilCheckError( cutStartTimer( timer)); // execute the kernel maxreduce<T>(n, numThreads, numBlocks, whichKernel, d_idata, d_odata); // check if kernel execution generated an error cutilCheckMsg("Kernel execution failed"); if (cpuFinalReduction) { // sum partial sums from each block on CPU // copy result from device to host cutilSafeCallNoSync( cudaMemcpy( h_odata, d_odata, numBlocks*sizeof(T), cudaMemcpyDeviceToHost) ); for(int i=0; i<numBlocks; i++) { gpu_result += h_odata[i]; } needReadBack = false; } else { // sum partial block sums on GPU int s=numBlocks; int kernel = whichKernel; while(s > cpuFinalThreshold) { int threads = 0, blocks = 0; getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads); maxreduce<T>(s, threads, blocks, kernel, d_odata, d_odata); if (kernel < 3) s = (s + threads - 1) / threads; else s = (s + (threads*2-1)) / (threads*2); } if (s > 1) { // copy result from device to host cutilSafeCallNoSync( cudaMemcpy( h_odata, d_odata, s * sizeof(T), cudaMemcpyDeviceToHost) ); for(int i=0; i < s; i++) { gpu_result += h_odata[i]; } needReadBack = false; } } cutilDeviceSynchronize(); cutilCheckError( cutStopTimer(timer) ); } if (needReadBack) { // copy final sum from device to host cutilSafeCallNoSync( cudaMemcpy( &gpu_result, d_odata, sizeof(T), cudaMemcpyDeviceToHost) ); } return gpu_result; }