//***************************************************************************** bool CompareResults(int numBodies) { // Run computation on the device/GPU shrLog(" Computing on the Device / GPU...\n"); nbodyGPU->update(0.001f); nbodyGPU->synchronizeThreads(); // Write out device/GPU data file for regression analysis shrLog(" Writing out Device/GPU data file for analysis...\n"); float* fGPUData = nbodyGPU->getArray(BodySystem::BODYSYSTEM_POSITION); shrWriteFilef( "oclNbody_Regression.dat", fGPUData, numBodies, 0.0, false); // Run computation on the host CPU shrLog(" Computing on the Host / CPU...\n\n"); BodySystemCPU* nbodyCPU = new BodySystemCPU(numBodies); nbodyCPU->setArray(BodySystem::BODYSYSTEM_POSITION, hPos); nbodyCPU->setArray(BodySystem::BODYSYSTEM_VELOCITY, hVel); nbodyCPU->update(0.001f); // Check if result matches shrBOOL bMatch = shrComparefe(fGPUData, nbodyGPU->getArray(BodySystem::BODYSYSTEM_POSITION), numBodies, .001f); shrLog("Results %s\n\n", (shrTRUE == bMatch) ? "Match" : "do not match!"); // Cleanup local allocation if(nbodyCPU)delete nbodyCPU; return (shrTRUE == bMatch); }
extern "C" void initHistogram64(cl_context cxGPUContext, cl_command_queue cqParamCommandQue, const char **argv){ cl_int ciErrNum; size_t kernelLength; shrLog("...loading Histogram64.cl from file\n"); char *cHistogram64 = oclLoadProgSource(shrFindFilePath("Histogram64.cl", argv[0]), "// My comment\n", &kernelLength); shrCheckError(cHistogram64 != NULL, shrTRUE); shrLog("...creating histogram64 program\n"); cpHistogram64 = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cHistogram64, &kernelLength, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); shrLog("...building histogram64 program\n"); ciErrNum = clBuildProgram(cpHistogram64, 0, NULL, compileOptions, NULL, NULL); shrCheckError(ciErrNum, CL_SUCCESS); shrLog("...creating histogram64 kernels\n"); ckHistogram64 = clCreateKernel(cpHistogram64, "histogram64", &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); ckMergeHistogram64 = clCreateKernel(cpHistogram64, "mergeHistogram64", &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); shrLog("...allocating internal histogram64 buffer\n"); d_PartialHistograms = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, MAX_PARTIAL_HISTOGRAM64_COUNT * HISTOGRAM64_BIN_COUNT * sizeof(uint), NULL, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); //Save default command queue cqDefaultCommandQue = cqParamCommandQue; //Discard temp storage free(cHistogram64); //Save ptx code to separate file oclLogPtx(cpHistogram64, oclGetFirstDev(cxGPUContext), "Histogram64.ptx"); }
void printDiff(float *data1, float *data2, int width, int height, int iListLength, float fListTol) { shrLog("Listing first %d Differences > %.6f...\n", iListLength, fListTol); int i,j,k; int error_count=0; for (j = 0; j < height; j++) { if (error_count < iListLength) { shrLog("\n Row %d:\n", j); } for (i = 0; i < width; i++) { k = j * width + i; float fDiff = fabs(data1[k] - data2[k]); if (fDiff > fListTol) { if (error_count < iListLength) { shrLog(" Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j, data1[k], data2[k], fDiff); } error_count++; } } } shrLog(" \n Total Errors = %d\n\n", error_count); }
// assumes the values were initially indices into the array, for simplicity of // checking correct order of values bool verifySortUint(unsigned int *keysSorted, unsigned int *valuesSorted, unsigned int *keysUnsorted, unsigned int len) { bool passed = true; for(unsigned int i=0; i<len-1; ++i) { if( (keysSorted[i])>(keysSorted[i+1]) ) { shrLog("Unordered key[%d]: %d > key[%d]: %d\n", i, keysSorted[i], i+1, keysSorted[i+1]); passed = false; break; } } if (valuesSorted) { for(unsigned int i=0; i<len; ++i) { if( keysUnsorted[valuesSorted[i]] != keysSorted[i] ) { shrLog("Incorrectly sorted value[%u] (%u): %u != %u\n", i, valuesSorted[i], keysUnsorted[valuesSorted[i]], keysSorted[i]); passed = false; break; } } } return passed; }
void initGL( int *argc, char **argv ) { // initialize GLUT glutInit(argc, argv); glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE); glutInitWindowSize(768, 768); glutCreateWindow("CUDA Box Filter"); glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutReshapeFunc(reshape); glutTimerFunc(REFRESH_DELAY, timerEvent, 0); glewInit(); if (g_bFBODisplay) { if (!glewIsSupported( "GL_VERSION_2_0 GL_ARB_fragment_program GL_EXT_framebuffer_object" )) { shrLog("Error: failed to get minimal extensions for demo\n"); shrLog("This sample requires:\n"); shrLog(" OpenGL version 2.0\n"); shrLog(" GL_ARB_fragment_program\n"); shrLog(" GL_EXT_framebuffer_object\n"); exit(-1); } } else { if (!glewIsSupported( "GL_VERSION_1_5 GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object" )) { shrLog("Error: failed to get minimal extensions for demo\n"); shrLog("This sample requires:\n"); shrLog(" OpenGL version 1.5\n"); shrLog(" GL_ARB_vertex_buffer_object\n"); shrLog(" GL_ARB_pixel_buffer_object\n"); exit(-1); } } }
////////////////////////////////////////////////////////////////////////////// //! Gets the platform ID for NVIDIA if available, otherwise default //! //! @return the id //! @param clSelectedPlatformID OpenCL platoform ID ////////////////////////////////////////////////////////////////////////////// cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID) { char chBuffer[1024]; cl_uint num_platforms; cl_platform_id* clPlatformIDs; cl_int ciErrNum; *clSelectedPlatformID = NULL; // Get OpenCL platform count ciErrNum = clGetPlatformIDs (0, NULL, &num_platforms); if (ciErrNum != CL_SUCCESS) { shrLog(" Error %i in clGetPlatformIDs Call !!!\n\n", ciErrNum); return -1000; } else { if(num_platforms == 0) { shrLog("No OpenCL platform found!\n\n"); return -2000; } else { // if there's a platform or more, make space for ID's if ((clPlatformIDs = (cl_platform_id*)malloc(num_platforms * sizeof(cl_platform_id))) == NULL) { shrLog("Failed to allocate memory for cl_platform ID's!\n\n"); return -3000; } // get platform info for each platform and trap the NVIDIA platform if found ciErrNum = clGetPlatformIDs (num_platforms, clPlatformIDs, NULL); for(cl_uint i = 0; i < num_platforms; ++i) { ciErrNum = clGetPlatformInfo (clPlatformIDs[i], CL_PLATFORM_NAME, 1024, &chBuffer, NULL); if(ciErrNum == CL_SUCCESS) { if(strstr(chBuffer, "NVIDIA") != NULL) { *clSelectedPlatformID = clPlatformIDs[i]; break; } } } // default to zeroeth platform if NVIDIA not found if(*clSelectedPlatformID == NULL) { shrLog("WARNING: NVIDIA OpenCL platform not found - defaulting to first platform!\n\n"); *clSelectedPlatformID = clPlatformIDs[0]; } free(clPlatformIDs); } } return CL_SUCCESS; }
extern "C" void initConvolutionSeparable(cl_context cxGPUContext, cl_command_queue cqParamCommandQueue, const char **argv){ cl_int ciErrNum; size_t kernelLength; shrLog("Loading ConvolutionSeparable.cl...\n"); char *cPathAndName = shrFindFilePath("ConvolutionSeparable.cl", argv[0]); oclCheckError(cPathAndName != NULL, shrTRUE); char *cConvolutionSeparable = oclLoadProgSource(cPathAndName, "// My comment\n", &kernelLength); oclCheckError(cConvolutionSeparable != NULL, shrTRUE); shrLog("Creating convolutionSeparable program...\n"); cpConvolutionSeparable = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cConvolutionSeparable, &kernelLength, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Building convolutionSeparable program...\n"); char compileOptions[2048]; #ifdef _WIN32 sprintf_s(compileOptions, 2048, "\ -cl-fast-relaxed-math \ -D KERNEL_RADIUS=%u\ -D ROWS_BLOCKDIM_X=%u -D COLUMNS_BLOCKDIM_X=%u\ -D ROWS_BLOCKDIM_Y=%u -D COLUMNS_BLOCKDIM_Y=%u\ -D ROWS_RESULT_STEPS=%u -D COLUMNS_RESULT_STEPS=%u\ -D ROWS_HALO_STEPS=%u -D COLUMNS_HALO_STEPS=%u\ ", KERNEL_RADIUS, ROWS_BLOCKDIM_X, COLUMNS_BLOCKDIM_X, ROWS_BLOCKDIM_Y, COLUMNS_BLOCKDIM_Y, ROWS_RESULT_STEPS, COLUMNS_RESULT_STEPS, ROWS_HALO_STEPS, COLUMNS_HALO_STEPS ); #else sprintf(compileOptions, "\ -cl-fast-relaxed-math \ -D KERNEL_RADIUS=%u\ -D ROWS_BLOCKDIM_X=%u -D COLUMNS_BLOCKDIM_X=%u\ -D ROWS_BLOCKDIM_Y=%u -D COLUMNS_BLOCKDIM_Y=%u\ -D ROWS_RESULT_STEPS=%u -D COLUMNS_RESULT_STEPS=%u\ -D ROWS_HALO_STEPS=%u -D COLUMNS_HALO_STEPS=%u\ ", KERNEL_RADIUS, ROWS_BLOCKDIM_X, COLUMNS_BLOCKDIM_X, ROWS_BLOCKDIM_Y, COLUMNS_BLOCKDIM_Y, ROWS_RESULT_STEPS, COLUMNS_RESULT_STEPS, ROWS_HALO_STEPS, COLUMNS_HALO_STEPS ); #endif ciErrNum = clBuildProgram(cpConvolutionSeparable, 0, NULL, compileOptions, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); ckConvolutionRows = clCreateKernel(cpConvolutionSeparable, "convolutionRows", &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); ckConvolutionColumns = clCreateKernel(cpConvolutionSeparable, "convolutionColumns", &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); cqDefaultCommandQueue = cqParamCommandQueue; free(cConvolutionSeparable); }
//----------------------------------------------------------------------------- // Name: InitD3D9() // Desc: Initializes Direct3D9 //----------------------------------------------------------------------------- HRESULT InitD3D9(HWND hWnd) { // Create the D3D object. if( NULL == ( g_pD3D = Direct3DCreate9( D3D_SDK_VERSION ) ) ) { shrLog("No Direct3D9 device available\n"); Cleanup(EXIT_SUCCESS); } // Find the first CL capable device for(g_iAdapter = 0; g_iAdapter < g_pD3D->GetAdapterCount(); g_iAdapter++) { D3DCAPS9 caps; if (FAILED(g_pD3D->GetDeviceCaps(g_iAdapter, D3DDEVTYPE_HAL, &caps))) // Adapter doesn't support Direct3D continue; if(FAILED(g_pD3D->GetAdapterIdentifier(g_iAdapter, 0, &g_adapter_id))) return E_FAIL; break; } // we check to make sure we have found a OpenCL-compatible D3D device to work on if(g_iAdapter == g_pD3D->GetAdapterCount() ) { shrLog("No OpenCL-compatible Direct3D9 device available\n"); // destroy the D3D device g_pD3D->Release(); Cleanup(EXIT_SUCCESS); } // Create the D3D Display Device RECT rc; GetClientRect(hWnd,&rc); D3DDISPLAYMODE d3ddm; g_pD3D->GetAdapterDisplayMode(g_iAdapter, &d3ddm); D3DPRESENT_PARAMETERS d3dpp; ZeroMemory( &d3dpp, sizeof(d3dpp) ); d3dpp.Windowed = TRUE; d3dpp.BackBufferCount = 1; d3dpp.SwapEffect = D3DSWAPEFFECT_DISCARD; d3dpp.hDeviceWindow = hWnd; d3dpp.BackBufferWidth = g_WindowWidth; d3dpp.BackBufferHeight = g_WindowHeight; d3dpp.BackBufferFormat = d3ddm.Format; if (FAILED (g_pD3D->CreateDevice (g_iAdapter, D3DDEVTYPE_HAL, hWnd, D3DCREATE_HARDWARE_VERTEXPROCESSING, &d3dpp, &g_pD3DDevice) )) return E_FAIL; // We clear the back buffer g_pD3DDevice->BeginScene(); g_pD3DDevice->Clear(0, NULL, D3DCLEAR_TARGET, 0, 1.0f, 0); g_pD3DDevice->EndScene(); return S_OK; }
void showHelp() { shrLog("\n> Command line options\n"); shrLog("\t-fullscreen (run n-body simulation in fullscreen mode)\n"); shrLog("\t-fp64 (use double precision floating point values for simulation)\n"); shrLog("\t-hostmem (stores simulation data in host memory)\n"); shrLog("\t-benchmark (run benchmark to measure perfomance) \n"); shrLog("\t-n=<numBodies> (set particle number) \n"); shrLog("\t-compare (compare to CPU results)\n"); shrLog("\t-device=<d> (where d=0,1,2.... for the CUDA device to use)\n"); shrLog("\t-numdevices=<n> (where n is the number of CUDA devices to use for simulation)\n"); shrLog("\t-cpu (run n-body simulation on the CPU)\n\n"); }
// Keyboard events handler //***************************************************************************** void KeyboardGL(unsigned char key, int x, int y) { switch(key) { case 'P': // P toggles Processing between CPU and GPU case 'p': // p toggles Processing between CPU and GPU if (iProcFlag == 0) { iProcFlag = 1; } else { iProcFlag = 0; } shrLog("\n%s Processing...\n", cProcessor[iProcFlag]); break; case ' ': // space bar toggles processing on and off bPostprocess = !bPostprocess; shrLog("\nPostprocessing (Blur Filter) Toggled %s...\n", bPostprocess ? "ON" : "OFF"); break; case 'A': // 'A' toggles animation (spinning of teacup) on/off case 'a': // 'a' toggles animation (spinning of teacup) on/off bAnimate = !bAnimate; shrLog("\nGL Animation (Rotation) Toggled %s...\n", bAnimate ? "ON" : "OFF"); break; case '=': case '+': if (blur_radius < 16) blur_radius++; shrLog("\nBlur radius = %d\n", blur_radius); break; case '-': case '_': if (blur_radius > 1) blur_radius--; shrLog("\nBlur radius = %d\n", blur_radius); break; case '\033': // escape quits case '\015': // Enter quits case 'Q': // Q quits case 'q': // q (or escape) quits // Cleanup then quit (without prompting) bNoPrompt = shrTRUE; Cleanup(EXIT_SUCCESS); break; } // Trigger fps update and call for refresh TriggerFPSUpdate(); glutPostRedisplay(); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char** argv) { shrQAStart(argc, argv); // start the logs shrSetLogFileName ("oclMatrixMul.txt"); shrLog("%s Starting...\n\n", argv[0]); // run the code bool bOK = (runTest(argc, (const char **)argv) == CL_SUCCESS); shrLog("%s\n\n", (bOK ? "PASSED" : "FAILED")); // finish shrQAFinishExit(argc, (const char **)argv, (bOK ? QA_PASSED : QA_FAILED)); }
////////////////////////////////////////////////////////////////////// // readback // ////////////////////////////////////////////////////////////////////// bool CheckBackBuffer::readback( GLuint width, GLuint height ) { bool ret = false; if (m_bUsePBO) { // binds the PBO for readback bindReadback(); // Initiate the readback BLT from BackBuffer->PBO->membuf glReadPixels(0, 0, width, height, getPixelFormat(), GL_UNSIGNED_BYTE, BUFFER_OFFSET(0)); ret = checkStatus(__FILE__, __LINE__, true); if (!ret) shrLog("CheckBackBuffer::glReadPixels() checkStatus = %d\n", ret); // map - unmap simulates readback without the copy void *ioMem = glMapBufferARB(GL_PIXEL_PACK_BUFFER_ARB, GL_READ_ONLY_ARB); memcpy(m_pImageData, ioMem, width*height*m_Bpp); glUnmapBufferARB(GL_PIXEL_PACK_BUFFER_ARB); // release the PBO unbindReadback(); } else { // reading direct from the backbuffer glReadBuffer(GL_FRONT); glReadPixels(0, 0, width, height, getPixelFormat(), GL_UNSIGNED_BYTE, m_pImageData); } return ret; }
// Cleanup and exit code // ********************************************************************* void Cleanup(int iExitCode) { // Cleanup allocated objects shrLog("Starting Cleanup...\n\n"); if(cdDevices)free(cdDevices); if(cPathAndName)free(cPathAndName); if(cSourceCL)free(cSourceCL); if(ceEvent)clReleaseEvent(ceEvent); if(ckKernel)clReleaseKernel(ckKernel); if(cpProgram)clReleaseProgram(cpProgram); if(cqCommandQueue)clReleaseCommandQueue(cqCommandQueue); if(cxGPUContext)clReleaseContext(cxGPUContext); if (cmM)clReleaseMemObject(cmM); if (cmV)clReleaseMemObject(cmV); if (cmW)clReleaseMemObject(cmW); // Free host memory free(M); free(V); free(W); free(Golden); shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", cExecutableName); exit (iExitCode); }
void CheckRender::savePGM( const char *zfilename, bool bInvert, void **ppReadBuf ) { if (zfilename != NULL) { if (bInvert) { unsigned char *readBuf; unsigned char *writeBuf= (unsigned char *)malloc(m_Width * m_Height); for (unsigned int y=0; y < m_Height; y++) { if (ppReadBuf) { readBuf = *(unsigned char **)ppReadBuf; } else { readBuf = (unsigned char *)m_pImageData; } memcpy(&writeBuf[m_Width*m_Bpp*y], (readBuf+ m_Width*(m_Height-1-y)), m_Width); } // we copy the results back to original system buffer if (ppReadBuf) { memcpy(*ppReadBuf, writeBuf, m_Width*m_Height); } else { memcpy(m_pImageData, writeBuf, m_Width*m_Height); } free (writeBuf); } shrLog("\n> Saving PGM: <%s>\n", zfilename); if (ppReadBuf) { shrSavePGMub(zfilename, *(unsigned char **)ppReadBuf, m_Width, m_Height); } else { shrSavePGMub(zfilename, (unsigned char *)m_pImageData, m_Width, m_Height); } } }
bool CheckRender::PPMvsPPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold ) { char *ref_file_path = shrFindFilePath(ref_file, m_ExecPath); if (ref_file_path == NULL) { shrLog("\nCheckRender::PPMvsPPM unable to find <%s> in <%s> Aborting comparison!\n", ref_file, m_ExecPath); return false; } if (src_file == NULL || ref_file == NULL) { shrLog("\nCheckRender::PPMvsPPM: Aborting comparison\n"); return false; } return (shrComparePPM(src_file, ref_file_path, epsilon, threshold) == shrTRUE ? true : false); }
void Cleanup (int iExitCode) { // Cleanup allocated objects shrLog("Starting Cleanup...\n\n"); if(cPathAndName)free(cPathAndName); if(cSourceCL)free(cSourceCL); if(ckKernel)clReleaseKernel(ckKernel); if(cpProgram)clReleaseProgram(cpProgram); if(cqCommandQueue)clReleaseCommandQueue(cqCommandQueue); if(cxGPUContext)clReleaseContext(cxGPUContext); if(cmDevSrcA)clReleaseMemObject(cmDevSrcA); if(cmDevSrcB)clReleaseMemObject(cmDevSrcB); if(cmDevDst)clReleaseMemObject(cmDevDst); // Free host memory free(srcA); free(srcB); free (dst); free(Golden); // finalize logs and leave if (bNoPrompt) { shrLogEx(LOGBOTH | CLOSELOG, 0, "oclVectorAdd.exe Exiting...\n"); } else { shrLogEx(LOGBOTH | CLOSELOG, 0, "oclVectorAdd.exe Exiting...\nPress <Enter> to Quit\n"); getchar(); } exit (iExitCode); }
void keyboard(unsigned char key, int /*x*/, int /*y*/) { switch(key) { case 27: exit(0); break; case '=': case '+': if (filter_radius < (int)width-1 && filter_radius < (int)height-1) { filter_radius++; } break; case '-': if (filter_radius > 1) filter_radius--; break; case ']': iterations++; break; case '[': if (iterations>1) iterations--; break; default: break; } shrLog("radius = %d, iterations = %d\n", filter_radius, iterations); }
bool _compareResults(int numBodies) { assert(m_nbodyCuda); bool passed = true; m_nbody->update(0.001f); { m_nbodyCpu = new BodySystemCPU<T>(numBodies); m_nbodyCpu->setArray(BODYSYSTEM_POSITION, m_hPos); m_nbodyCpu->setArray(BODYSYSTEM_VELOCITY, m_hVel); m_nbodyCpu->update(0.001f); T* cudaPos = m_nbodyCuda->getArray(BODYSYSTEM_POSITION); T* cpuPos = m_nbodyCpu->getArray(BODYSYSTEM_POSITION); T tolerance = 0.0005f; for (int i = 0; i < numBodies; i++) { // if (((i + 1) % 4) && fabs(cpuPos[i] - cudaPos[i]) > tolerance) if (((i + 1) % 4) && fabs(cpuPos[i] - cudaPos[i]) > tolerance) { passed = false; shrLog("Error: element %d: (host)%f != (device)%f\n", i, cpuPos[i], cudaPos[i]); } } } return passed; }
void ParticleSystem::_initialize(int numParticles){ assert(!m_bInitialized); m_numParticles = numParticles; //Allocate host storage m_hPos = (float *)malloc(m_numParticles * 4 * sizeof(float)); m_hVel = (float *)malloc(m_numParticles * 4 * sizeof(float)); m_hReorderedPos = (float *)malloc(m_numParticles * 4 * sizeof(float)); m_hReorderedVel = (float *)malloc(m_numParticles * 4 * sizeof(float)); m_hHash = (uint *)malloc(m_numParticles * sizeof(uint)); m_hIndex = (uint *)malloc(m_numParticles * sizeof(uint)); m_hCellStart = (uint *)malloc(m_numGridCells * sizeof(uint)); m_hCellEnd = (uint *)malloc(m_numGridCells * sizeof(uint)); memset(m_hPos, 0, m_numParticles * 4 * sizeof(float)); memset(m_hVel, 0, m_numParticles * 4 * sizeof(float)); memset(m_hCellStart, 0, m_numGridCells * sizeof(uint)); memset(m_hCellEnd, 0, m_numGridCells * sizeof(uint)); //Allocate GPU data shrLog("Allocating GPU Data buffers...\n\n"); allocateArray(&m_dPos, m_numParticles * 4 * sizeof(float)); allocateArray(&m_dVel, m_numParticles * 4 * sizeof(float)); allocateArray(&m_dReorderedPos, m_numParticles * 4 * sizeof(float)); allocateArray(&m_dReorderedVel, m_numParticles * 4 * sizeof(float)); allocateArray(&m_dHash, m_numParticles * sizeof(uint)); allocateArray(&m_dIndex, m_numParticles * sizeof(uint)); allocateArray(&m_dCellStart, m_numGridCells * sizeof(uint)); allocateArray(&m_dCellEnd, m_numGridCells * sizeof(uint)); if (!m_bQATest) { //Allocate VBO storage m_posVbo = createVBO(m_numParticles * 4 * sizeof(float)); m_colorVBO = createVBO(m_numParticles * 4 * sizeof(float)); //Fill color buffer glBindBufferARB(GL_ARRAY_BUFFER, m_colorVBO); float *data = (float *)glMapBufferARB(GL_ARRAY_BUFFER, GL_WRITE_ONLY); float *ptr = data; for(uint i = 0; i < m_numParticles; i++){ float t = (float)i / (float) m_numParticles; #if 0 *ptr++ = rand() / (float) RAND_MAX; *ptr++ = rand() / (float) RAND_MAX; *ptr++ = rand() / (float) RAND_MAX; #else colorRamp(t, ptr); ptr += 3; #endif *ptr++ = 1.0f; } glUnmapBufferARB(GL_ARRAY_BUFFER); } setParameters(&m_params); setParametersHost(&m_params); m_bInitialized = true; }
////////////////////////////////////////////////////////////////////////////// //! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device //! //! @param cpProgram OpenCL program //! @param cdDevice device of interest ////////////////////////////////////////////////////////////////////////////// void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice) { // write out the build log and ptx, then exit char cBuildLog[10240]; clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, sizeof(cBuildLog), cBuildLog, NULL ); shrLog("\n%s\nBuild Log:\n%s\n%s\n", HDASHLINE, cBuildLog, HDASHLINE); }
void loadImageData(int argc, char **argv) { // load image (needed so we can get the width and height before we create the window char* image_path = NULL; if (argc >= 1) image_path = shrFindFilePath(image_filename, argv[0]); if (image_path == 0) { shrLog("Error finding image file '%s'\n", image_filename); exit(EXIT_FAILURE); } cutilCheckError(cutLoadPPM4ub(image_path, (unsigned char **) &h_img, &width, &height)); if (!h_img) { shrLog("Error opening file '%s'\n", image_path); exit(-1); } shrLog("Loaded '%s', %d x %d pixels\n\n", image_path, width, height); }
void _runBenchmark(int iterations) { // once without timing to prime the device if (!useCpu) m_nbody->update(activeParams.m_timestep); if (useCpu) { cutCreateTimer(&timer); cutStartTimer(timer); } else { cutilSafeCall(cudaEventRecord(startEvent, 0)); } for (int i = 0; i < iterations; ++i) { m_nbody->update(activeParams.m_timestep); } float milliseconds = 0; if (useCpu) { cutStopTimer(timer); milliseconds = cutGetTimerValue(timer); cutDeleteTimer(timer); } else { cutilSafeCall(cudaEventRecord(stopEvent, 0)); cutilSafeCall(cudaEventSynchronize(stopEvent)); cutilSafeCall( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent)); } double interactionsPerSecond = 0; double gflops = 0; computePerfStats(interactionsPerSecond, gflops, milliseconds, iterations); shrLog("%d bodies, total time for %d iterations: %.3f ms\n", numBodies, iterations, milliseconds); shrLog("= %.3f billion interactions per second\n", interactionsPerSecond); shrLog("= %.3f %s-precision GFLOP/s at %d flops per interaction\n", gflops, (sizeof(T) > 4) ? "double" : "single", flopsPerInteraction); }
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device) { CUresult error_result = cuDeviceGetAttribute( attribute, device_attribute, device ); if (error_result != CUDA_SUCCESS) { shrLog( "cuDeviceGetAttribute returned %d\n-> %s\n", (int)error_result, getCudaDrvErrorString(error_result) ); exit(0); } }
void b2CLCommonData::initReadLastJointImpulses() { if (b2clGlobal_OpenCLSupported) { printf("Initializing b2CLCommonData...\n"); int err; //load opencl programs from files char* commonDataSource = 0; size_t commonDataSourceLen = 0; shrLog("...loading b2CLCommonData.cl\n"); #ifdef linux commonDataSource = b2clLoadProgSource(shrFindFilePath("/opt/apps/com.samsung.browser/include/Box2D/Common/OpenCL/b2CLCommonData.cl", NULL), "// My comment\n", &commonDataSourceLen); #elif defined (_WIN32) commonDataSource = b2clLoadProgSource(shrFindFilePath("../../Box2D/Common/OpenCL/b2CLCommonData.cl", NULL), "// My comment\n", &commonDataSourceLen); #elif defined (__EMSCRIPTEN__) commonDataSource = b2clLoadProgSource(shrFindFilePath("./Common/OpenCL/b2CLCommonData.cl", NULL), "// My comment\n", &commonDataSourceLen); #else commonDataSource = b2clLoadProgSource(shrFindFilePath("../../../Box2D/Common/OpenCL/b2CLCommonData.cl", NULL), "// My comment\n", &commonDataSourceLen); #endif if(commonDataSource == NULL) { b2Log("Could not load program source, is path 'b2CLCommonData.cl' correct?"); } //create the compute program from source kernel code commonDataProgram = clCreateProgramWithSource(b2CLDevice::instance().GetContext(), 1, (const char**) &commonDataSource, NULL, &err); if (!commonDataProgram) { printf("Error: Failed to create compute program!\n"); exit(1); } //build the program err = clBuildProgram(commonDataProgram, 0, NULL, OPENCL_BUILD_PATH, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[20480]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(commonDataProgram, b2CLDevice::instance().GetCurrentDevice(), CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } //create the compute kernel readLastJointImpulseKernel = clCreateKernel(commonDataProgram, "ReadLastJointImpulses", &err); if (!readLastJointImpulseKernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } b2CLDevice::instance().getMaximumKernelWorkGroupSize(readLastJointImpulseKernel, maxWorkGroupSizeForreadLastJointImpulse); } }
//////////////////////////////////////////////////////////////////////////////// //! Run a simple benchmark test for CUDA //////////////////////////////////////////////////////////////////////////////// void runBenchmark( int argc, char **argv ) { int devID = 0; shrLog("[runBenchmark]: [%s]\n", sSDKsample); devID = cutilChooseCudaDevice(argc, argv); loadImageData(argc, argv); initCuda(); g_CheckRender = new CheckBackBuffer(width, height, 4, false); g_CheckRender->setExecPath(argv[0]); unsigned int *d_result; cutilSafeCall( cudaMalloc( (void **)&d_result, width*height*sizeof(unsigned int)) ); // warm-up boxFilterRGBA(d_img, d_temp, d_temp, width, height, filter_radius, iterations, nthreads); cutilSafeCall( cutilDeviceSynchronize() ); // Start round-trip timer and process iCycles loops on the GPU iterations = 1; // standard 1-pass filtering const int iCycles = 150; double dProcessingTime = 0.0; shrLog("\nRunning BoxFilterGPU for %d cycles...\n\n", iCycles); shrDeltaT(2); for (int i = 0; i < iCycles; i++) { dProcessingTime += boxFilterRGBA(d_img, d_temp, d_img, width, height, filter_radius, iterations, nthreads); } // check if kernel execution generated an error and sync host cutilCheckMsg("Error: boxFilterRGBA Kernel execution FAILED"); cutilSafeCall(cutilDeviceSynchronize()); // Get average computation time dProcessingTime /= (double)iCycles; // log testname, throughput, timing and config info to sample and master logs shrLogEx(LOGBOTH | MASTER, 0, "boxFilter-texture, Throughput = %.4f M RGBA Pixels/s, Time = %.5f s, Size = %u RGBA Pixels, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * width * height)/dProcessingTime, dProcessingTime, (width * height), 1, nthreads); shrLog("\n"); }
void CheckRender::bindReadback() { if (!m_bQAReadback) { shrLog("CheckRender::bindReadback() uninitialized!\n"); return; } if (m_bUsePBO) { glBindBufferARB(GL_PIXEL_PACK_BUFFER_ARB, m_pboReadback); // Bind the PBO } }
void CheckRender::unbindReadback() { if (!m_bQAReadback) { shrLog("CheckRender::unbindReadback() uninitialized!\n"); return; } if (m_bUsePBO) { glBindBufferARB(GL_PIXEL_PACK_BUFFER_ARB, 0); // Release the bind on the PBO } }
////////////////////////////////////////////////////////////////////// // readback // // Code to handle reading back of the FBO data // ////////////////////////////////////////////////////////////////////// bool CheckFBO::readback( GLuint width, GLuint height ) { bool ret = false; if (m_bUsePBO) { // binds the PBO for readback bindReadback(); // bind FBO buffer (we want to transfer FBO -> PBO) glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, m_pFrameBufferObject->getFbo()); // Now initiate the readback to PBO glReadPixels(0, 0, width, height, getPixelFormat(), GL_UNSIGNED_BYTE, BUFFER_OFFSET(0)); ret = checkStatus(__FILE__, __LINE__, true); if (!ret) shrLog("CheckFBO::readback() FBO->PBO checkStatus = %d\n", ret); // map - unmap simulates readback without the copy void *ioMem = glMapBufferARB(GL_PIXEL_PACK_BUFFER_ARB, GL_READ_ONLY_ARB); memcpy(m_pImageData, ioMem, width*height*m_Bpp); glUnmapBufferARB(GL_PIXEL_PACK_BUFFER_ARB); // release the FBO glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0); // release the PBO unbindReadback(); } else { // Reading back from FBO using glReadPixels glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, m_pFrameBufferObject->getFbo()); ret = checkStatus(__FILE__, __LINE__, true); if (!ret) shrLog("CheckFBO::readback::glBindFramebufferEXT() checkStatus = %d\n", ret); glReadBuffer(static_cast<GLenum>(GL_COLOR_ATTACHMENT0_EXT)); ret &= checkStatus(__FILE__, __LINE__, true); if (!ret) shrLog("CheckFBO::readback::glReadBuffer() checkStatus = %d\n", ret); glReadPixels(0, 0, width, height, getPixelFormat(), GL_UNSIGNED_BYTE, m_pImageData); glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0); } return CHECK_FBO; }
///////////////////////////////////////////////////////// //print results in an easily read format //////////////////////////////////////////////////////// void printResultsReadable(unsigned int *memSizes, double* bandwidths, unsigned int count, memcpyKind kind, accessMode accMode, memoryMode memMode, int iNumDevs) { // log config information if (kind == DEVICE_TO_DEVICE) { shrLog("Device to Device Bandwidth, %i Device(s)\n", iNumDevs); } else { if (kind == DEVICE_TO_HOST) { shrLog("Device to Host Bandwidth, %i Device(s), ", iNumDevs); } else if (kind == HOST_TO_DEVICE) { shrLog("Host to Device Bandwidth, %i Device(s), ", iNumDevs); } if(memMode == PAGEABLE) { shrLog("Paged memory"); } else if (memMode == PINNED) { shrLog("Pinned memory"); } if(accMode == DIRECT) { shrLog(", direct access\n"); } else if (accMode == MAPPED) { shrLog(", mapped access\n"); } } shrLog(" Transfer Size (Bytes)\tBandwidth(MB/s)\n"); unsigned int i; for(i = 0; i < (count - 1); i++) { shrLog(" %u\t\t\t%s%.1f\n", memSizes[i], (memSizes[i] < 10000)? "\t" : "", bandwidths[i]); } shrLog(" %u\t\t\t%s%.1f\n\n", memSizes[i], (memSizes[i] < 10000)? "\t" : "", bandwidths[i]); }
bool CheckBackBuffer::checkStatus(const char *zfile, int line, bool silent) { GLenum nErrorCode = glGetError(); if (nErrorCode != GL_NO_ERROR) { if (!silent) shrLog("Assertion failed(%s,%d): %s\n", zfile, line, gluErrorString(nErrorCode)); } return true; }