void glcu::init_cuda() { cudaDeviceProp prop = {0}; int dev; prop.major = 1; prop.minor = 0; HANDLE_ERROR(cudaChooseDevice(&dev, &prop)); HANDLE_ERROR(cudaGLSetGLDevice(dev)); }
void CGLUtil::setCudaDeviceForGLInteroperation() { cudaDeviceProp sProp; memset( &sProp, 0, sizeof( cudaDeviceProp ) ); sProp.major = 1; sProp.minor = 0; int nDev; cudaSafeCall( cudaChooseDevice( &nDev, &sProp ) ); // tell CUDA which nDev we will be using for graphic interop // from the programming guide: Interoperability with OpenGL // requires that the CUDA nDeviceNO_ be specified by // cudaGLSetGLDevice() before any other runtime calls. //cudaSafeCall( cudaGLSetGLDevice( nDev ) ; return; }//setCudaDeviceForGLInteroperation()
int main(int argc, char **argv) { uchar *h_Data; uint *h_HistogramCPU, *h_HistogramGPU; uchar *d_Data; uint *d_Histogram; uint hTimer; int PassFailFlag = 1; uint byteCount = 64 * 1048576; uint uiSizeMult = 1; cudaDeviceProp deviceProp; deviceProp.major = 0; deviceProp.minor = 0; int dev; shrQAStart(argc, argv); // set logfile name and start logs shrSetLogFileName ("histogram.txt"); //Use command-line specified CUDA device, otherwise use device with highest Gflops/s if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") ) { dev = cutilDeviceInit(argc, argv); if (dev < 0) { printf("No CUDA Capable Devices found, exiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } } else { cudaSetDevice( dev = cutGetMaxGflopsDeviceId() ); cutilSafeCall( cudaChooseDevice(&dev, &deviceProp) ); } cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev) ); printf("CUDA device [%s] has %d Multi-Processors, Compute %d.%d\n", deviceProp.name, deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor); int version = deviceProp.major * 0x10 + deviceProp.minor; if(version < 0x11) { printf("There is no device supporting a minimum of CUDA compute capability 1.1 for this SDK sample\n"); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } cutilCheckError(cutCreateTimer(&hTimer)); // Optional Command-line multiplier to increase size of array to histogram if (shrGetCmdLineArgumentu(argc, (const char**)argv, "sizemult", &uiSizeMult)) { uiSizeMult = CLAMP(uiSizeMult, 1, 10); byteCount *= uiSizeMult; } shrLog("Initializing data...\n"); shrLog("...allocating CPU memory.\n"); h_Data = (uchar *)malloc(byteCount); h_HistogramCPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint)); h_HistogramGPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint)); shrLog("...generating input data\n"); srand(2009); for(uint i = 0; i < byteCount; i++) h_Data[i] = rand() % 256; shrLog("...allocating GPU memory and copying input data\n\n"); cutilSafeCall( 
cudaMalloc((void **)&d_Data, byteCount ) ); cutilSafeCall( cudaMalloc((void **)&d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint) ) ); cutilSafeCall( cudaMemcpy(d_Data, h_Data, byteCount, cudaMemcpyHostToDevice) ); { shrLog("Starting up 64-bin histogram...\n\n"); initHistogram64(); shrLog("Running 64-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns); for(int iter = -1; iter < numRuns; iter++){ //iter == -1 -- warmup iteration if(iter == 0){ cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } histogram64(d_Histogram, d_Data, byteCount); } cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutStopTimer(hTimer)); double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns; shrLog("histogram64() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs); shrLogEx(LOGBOTH | MASTER, 0, "histogram64, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM64_THREADBLOCK_SIZE); shrLog("\nValidating GPU results...\n"); shrLog(" ...reading back GPU results\n"); cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM64_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) ); shrLog(" ...histogram64CPU()\n"); histogram64CPU( h_HistogramCPU, h_Data, byteCount ); shrLog(" ...comparing the results...\n"); for(uint i = 0; i < HISTOGRAM64_BIN_COUNT; i++) if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0; shrLog(PassFailFlag ? 
" ...64-bin histograms match\n\n" : " ***64-bin histograms do not match!!!***\n\n" ); shrLog("Shutting down 64-bin histogram...\n\n\n"); closeHistogram64(); } { shrLog("Initializing 256-bin histogram...\n"); initHistogram256(); shrLog("Running 256-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns); for(int iter = -1; iter < numRuns; iter++){ //iter == -1 -- warmup iteration if(iter == 0){ cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } histogram256(d_Histogram, d_Data, byteCount); } cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutStopTimer(hTimer)); double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns; shrLog("histogram256() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs); shrLogEx(LOGBOTH | MASTER, 0, "histogram256, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM256_THREADBLOCK_SIZE); shrLog("\nValidating GPU results...\n"); shrLog(" ...reading back GPU results\n"); cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) ); shrLog(" ...histogram256CPU()\n"); histogram256CPU( h_HistogramCPU, h_Data, byteCount ); shrLog(" ...comparing the results\n"); for(uint i = 0; i < HISTOGRAM256_BIN_COUNT; i++) if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0; shrLog(PassFailFlag ? 
" ...256-bin histograms match\n\n" : " ***256-bin histograms do not match!!!***\n\n" ); shrLog("Shutting down 256-bin histogram...\n\n\n"); closeHistogram256(); } shrLog("Shutting down...\n"); cutilCheckError(cutDeleteTimer(hTimer)); cutilSafeCall( cudaFree(d_Histogram) ); cutilSafeCall( cudaFree(d_Data) ); free(h_HistogramGPU); free(h_HistogramCPU); free(h_Data); cutilDeviceReset(); shrLog("%s - Test Summary\n", sSDKsample); // pass or fail (for both 64 bit and 256 bit histograms) shrQAFinishExit(argc, (const char **)argv, (PassFailFlag ? QA_PASSED : QA_FAILED)); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { shrQAStart( argc, argv ); shrSetLogFileName ("reduction.txt"); char *reduceMethod; cutGetCmdLineArgumentstr( argc, (const char**) argv, "method", &reduceMethod); char *typeChoice; cutGetCmdLineArgumentstr( argc, (const char**) argv, "type", &typeChoice); if (0 == typeChoice) { typeChoice = (char*)malloc(4 * sizeof(char)); strcpy(typeChoice, "int"); } ReduceType datatype = REDUCE_INT; if (!strcasecmp(typeChoice, "float")) datatype = REDUCE_FLOAT; else if (!strcasecmp(typeChoice, "double")) datatype = REDUCE_DOUBLE; else datatype = REDUCE_INT; cudaDeviceProp deviceProp; deviceProp.major = 1; deviceProp.minor = 0; int minimumComputeVersion = 10; if (datatype == REDUCE_DOUBLE) { deviceProp.minor = 3; minimumComputeVersion = 13; } int dev; if(!cutCheckCmdLineFlag(argc, (const char**)argv, "method") ) { fprintf(stderr, "MISSING --method FLAG.\nYou must provide --method={ SUM | MIN | MAX }.\n"); exit(1); } if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilDeviceInit(argc, argv); cutilSafeCallNoSync(cudaGetDevice(&dev)); } else { cutilSafeCallNoSync(cudaChooseDevice(&dev, &deviceProp)); } cutilSafeCallNoSync(cudaGetDeviceProperties(&deviceProp, dev)); if((deviceProp.major * 10 + deviceProp.minor) >= minimumComputeVersion) { shrLog("Using Device %d: %s\n\n", dev, deviceProp.name); cutilSafeCallNoSync(cudaSetDevice(dev)); } else { shrLog("Error: the selected device does not support the minimum compute capability of %d.%d.\n\n", minimumComputeVersion / 10, minimumComputeVersion % 10); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } shrLog("Reducing array of type %s\n\n", typeChoice); bool bResult = false; switch (datatype) { default: case REDUCE_INT: if (strcmp("SUM", reduceMethod) == 0) { bResult = runTestSum<int>( 
argc, argv, datatype); } else if ( strcmp("MAX", reduceMethod) == 0 ) { bResult = runTestMax<int>( argc, argv, datatype); } else if ( strcmp("MIN", reduceMethod) == 0 ) { bResult = runTestMin<int>( argc, argv, datatype); } else { fprintf(stderr, "No --method specified!\n"); exit(1); } break; case REDUCE_FLOAT: if (strcmp("SUM", reduceMethod) == 0) { bResult = runTestSum<float>( argc, argv, datatype); } else if ( strcmp("MAX", reduceMethod) == 0 ) { bResult = runTestMax<float>( argc, argv, datatype); } else if ( strcmp("MIN", reduceMethod) == 0 ) { bResult = runTestMin<float>( argc, argv, datatype); } else { fprintf(stderr, "No --method specified!\n"); exit(1); } break; case REDUCE_DOUBLE: if (strcmp("SUM", reduceMethod) == 0) { bResult = runTestSum<double>( argc, argv, datatype); } else if ( strcmp("MAX", reduceMethod) == 0 ) { bResult = runTestMax<double>( argc, argv, datatype); } else if ( strcmp("MIN", reduceMethod) == 0 ) { bResult = runTestMin<double>( argc, argv, datatype); } else { fprintf(stderr, "No --method specified!\n"); exit(1); } break; } cutilDeviceReset(); shrQAFinishExit(argc, (const char**)argv, (bResult ? QA_PASSED : QA_FAILED)); }
/*
 * WINAPI-convention wrapper around cudaChooseDevice().
 *
 * Emits a trace line, forwards both arguments unchanged to
 * cudaChooseDevice() and returns the status code it produced.
 */
cudaError_t WINAPI wine_cudaChooseDevice( int *device, const struct cudaDeviceProp *prop )
{
    cudaError_t status;

    WINE_TRACE("\n");

    status = cudaChooseDevice( device, prop );
    return status;
}
/*
 * Checked wrapper around cudaChooseDevice().
 *
 * device and prop are forwarded unchanged to cudaChooseDevice(); the status
 * it returns is passed to check_cuda_error() rather than handed back to the
 * caller, so this function itself returns nothing.
 */
void cuda_choose_device( int *device, const struct cudaDeviceProp *prop) { check_cuda_error( cudaChooseDevice( device, prop)); }
/*
    GL interop test. Julia.

    Opens a GL test window, selects a CUDA device for GL interop (criteria:
    compute capability 2.0), registers a pixel-unpack buffer with CUDA and, for
    about one second, lets Run_GLInteropTestJuliaKernel() write into the mapped
    buffer each frame before uploading it to a texture that is drawn on a
    full-screen quad. The window title shows an FPS value refreshed every 13
    frames. A GLException fails the test.
*/
TEST(GLInteropTest, Julia)
{
    try
    {
        const int width = 1280;
        const int height = 720;
        const int bufWidth = 1920;
        const int bufHeight = 1080;

        // ------------------------------------------------------------

        GLTestWindow window(width, height, true);
        GLContextParam param;

        param.DebugMode = true;
        param.Multisample = 8;

        GLContext context(window.Handle(), param);
        GLUtil::EnableDebugOutput(GLUtil::DebugOutputFrequencyLow);

        // ------------------------------------------------------------

        // Choose device
        cudaDeviceProp prop;
        memset(&prop, 0, sizeof(cudaDeviceProp));
        prop.major = 2;
        prop.minor = 0;

        int devID;
        HandleCudaError(cudaChooseDevice(&devID, &prop));
        HandleCudaError(cudaGLSetGLDevice(devID));

        // Get properties
        HandleCudaError(cudaGetDeviceProperties(&prop, devID));

        // Create texture and PBO
        GLTexture2D texture;
        texture.SetMagFilter(GL_LINEAR);
        texture.SetMinFilter(GL_LINEAR);
        texture.SetWrap(GL_CLAMP_TO_EDGE);
        texture.Allocate(bufWidth, bufHeight, GL_RGBA8);

        GLPixelUnpackBuffer pbo;
        pbo.Allocate(bufWidth * bufHeight * 4, NULL, GL_DYNAMIC_DRAW);

        // Register. Registration takes the cudaGraphicsRegisterFlags*
        // enumerators, not the cudaGraphicsMapFlags* ones.
        cudaGraphicsResource* cudaPbo;
        HandleCudaError(cudaGraphicsGLRegisterBuffer(&cudaPbo, pbo.ID(), cudaGraphicsRegisterFlagsWriteDiscard));

        // ------------------------------------------------------------

        GLShader shader;
        shader.Compile("../resources/texturetest_simple2d.vert");
        shader.Compile("../resources/texturetest_simple2d.frag");
        shader.Link();

        GLVertexArray vao;
        GLVertexBuffer positionVbo;
        GLIndexBuffer ibo;

        // Quad covering the [-1, 1] square in x/y, drawn as two triangles.
        glm::vec3 v[] =
        {
            glm::vec3( 1.0f,  1.0f, 0.0f),
            glm::vec3(-1.0f,  1.0f, 0.0f),
            glm::vec3(-1.0f, -1.0f, 0.0f),
            glm::vec3( 1.0f, -1.0f, 0.0f)
        };

        GLuint i[] =
        {
            0, 1, 2,
            2, 3, 0
        };

        positionVbo.AddStatic(12, &v[0].x);
        vao.Add(GLDefaultVertexAttribute::Position, &positionVbo);
        ibo.AddStatic(6, i);

        // ------------------------------------------------------------

        double fps = 0.0;
        double timeSum = 0.0;
        double prevTime = GLTestUtil::CurrentTimeMilli();
        int frameCount = 0;
        double start = GLTestUtil::CurrentTimeMilli();

        float xcparam = -0.8f;
        float ycparam = 0.165f;
        float inc = 0.001f;

        while (window.ProcessEvent())
        {
            // ------------------------------------------------------------

            // Refresh the FPS estimate once every 13 frames.
            double currentTime = GLTestUtil::CurrentTimeMilli();
            double elapsedTime = currentTime - prevTime;

            timeSum += elapsedTime;
            frameCount++;

            if (frameCount >= 13)
            {
                fps = 1000.0 * 13.0 / timeSum;
                timeSum = 0.0;
                frameCount = 0;
            }

            prevTime = currentTime;
            window.SetTitle((boost::format("GLInteropTest_Julia [FPS %.1f]") % fps).str());

            // ------------------------------------------------------------

            // Stop after one second of wall-clock time.
            double elapsed = GLTestUtil::CurrentTimeMilli() - start;
            if (elapsed >= 1000.0)
            {
                break;
            }

            // Step the x parameter and reverse direction when it leaves the
            // (-0.811, -0.799) band.
            xcparam += inc;
            if (xcparam > -0.799f || xcparam < -0.811f)
            {
                inc *= -1.0f;
            }

            // ------------------------------------------------------------

            HandleCudaError(cudaGraphicsMapResources(1, &cudaPbo, NULL));

            // Get device pointer
            uchar4* devPtr;
            size_t bufferSize;
            HandleCudaError(cudaGraphicsResourceGetMappedPointer((void**)&devPtr, &bufferSize, cudaPbo));

            Run_GLInteropTestJuliaKernel(bufWidth, bufHeight, prop.multiProcessorCount, xcparam, ycparam, devPtr);

            // Unmap before GL reads the buffer for the texture upload.
            HandleCudaError(cudaGraphicsUnmapResources(1, &cudaPbo, NULL));

            texture.Replace(&pbo, glm::ivec4(0, 0, bufWidth, bufHeight), GL_RGBA, GL_UNSIGNED_BYTE);

            // ------------------------------------------------------------

            glClearBufferfv(GL_COLOR, 0, glm::value_ptr(glm::vec4(0.0f)));
            glViewportIndexedfv(0, glm::value_ptr(glm::vec4(0, 0, width, height)));

            shader.Begin();
            shader.SetUniform("tex", 0);
            texture.Bind();
            vao.Draw(GL_TRIANGLES, &ibo);
            texture.Unbind();
            shader.End();

            context.SwapBuffers();
        }

        // Release the CUDA registration while the GL buffer still exists,
        // then tear the device down and surface any failure.
        HandleCudaError(cudaGraphicsUnregisterResource(cudaPbo));
        HandleCudaError(cudaDeviceReset());
    }
    catch (const GLException& e)
    {
        FAIL() << GLTestUtil::PrintGLException(e);
    }
}