// Initialization code to find the best CUDA Device int findCudaDevice(int argc, const char **argv) { cudaDeviceProp deviceProp; int devID = 0; // If the command-line has a device number specified, use it if (checkCmdLineFlag(argc, argv, "device")) { devID = getCmdLineArgumentInt(argc, argv, "device="); if (devID < 0) { printf("Invalid command line parameters\n"); exit(-1); } else { devID = gpuDeviceInit(devID); if (devID < 0) { printf("exiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); exit(-1); } } } else { // Otherwise pick the device with highest Gflops/s devID = gpuGetMaxGflopsDeviceId(); checkCudaErrors( cudaSetDevice( devID ) ); checkCudaErrors( cudaGetDeviceProperties(&deviceProp, devID) ); printf("> Using CUDA device [%d]: %s\n", devID, deviceProp.name); } return devID; }
// initialize the given CUDA device and add it to the array of initialized devices; // if the given device is -1, the best available device is selected CUresult initDevice(int deviceID) { Device device; if (deviceID >= 0) { device.ID = gpuCheckDeviceId(deviceID); } else { // Otherwise pick the device with the highest Gflops/s device.ID = gpuGetMaxGflopsDeviceId(); } if (device.ID < 0) { VERBOSE("error: no CUDA capable devices found"); return CUDA_ERROR_NO_DEVICE; } checkCudaErrors(cudaSetDevice(device.ID)); checkCudaErrors(cudaGetDeviceProperties(&device.prop, device.ID)); if (device.prop.major < 2) { VERBOSE("CUDA error: compute capability 2.0 or greater required (available %d.%d for device[%d])", device.ID, device.prop.major, device.prop.minor); return CUDA_ERROR_INVALID_DEVICE; } devices.Insert(device); DEBUG("CUDA device[%d] initialized: %s (Compute Capability %d.%d)", device.ID, device.prop.name, device.prop.major, device.prop.minor); #if 1 // dummy memory allocation to work around a bug inside CUDA // (this seems to initialize some more things) void* cpDummy; cudaMalloc(&cpDummy, sizeof(int)); cudaFree(cpDummy); #endif return CUDA_SUCCESS; }
void initCuda(){ // Use device with highest Gflops/s // Had to update this to remove cutil version cudaGLSetGLDevice( gpuGetMaxGflopsDeviceId() ); initPBO(&pbo); // Clean up on program exit atexit(cleanupCuda); runCuda(); }
void initCuda(){ // Use device with highest Gflops/s #if CUDA_VERSION >= 5000 cudaGLSetGLDevice( gpuGetMaxGflopsDeviceId() ); #else cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); #endif initPBO(&pbo); // Clean up on program exit atexit(cleanupCuda); runCuda(); }
// Initialization code to find the best CUDA Device inline int findCudaDevice(int argc, const char **argv) { cudaDeviceProp deviceProp; int devID = 0; // If the command-line has a device number specified, use it if (checkCmdLineFlag(argc, argv, "device")) { devID = getCmdLineArgumentInt(argc, argv, "device="); if (devID < 0) { printf("Invalid command line parameter\n "); exit(EXIT_FAILURE); } else { devID = gpuDeviceInit(devID); if (devID < 0) { printf("exiting...\n"); exit(EXIT_FAILURE); } } } else { // Otherwise pick the device with highest Gflops/s devID = gpuGetMaxGflopsDeviceId(); checkCudaErrors(cudaSetDevice(devID)); checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor); } return devID; }
void initialize(int argc, char **argv) { printf("[%s] (OpenGL Mode)\n", sSDKsample); // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL(&argc, argv); int devID; cudaDeviceProp deviceProps; if (checkCmdLineFlag(argc, (const char **)argv, "device")) { devID = gpuGLDeviceInit(argc, (const char **)argv); if (devID < 0) { printf("exiting...\n"); exit(EXIT_SUCCESS); } } else { devID = gpuGetMaxGflopsDeviceId(); cudaGLSetGLDevice(devID); } // get number of SMs on this GPU checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount); // Create the timer (for fps measurement) sdkCreateTimer(&timer); // load image from disk loadImageData(argc, argv); printf("\n" "\tControls\n" "\t=/- : Zoom in/out\n" "\tb : Run Benchmark g_FilterMode\n" "\tc : Draw Bicubic Spline Curve\n" "\t[esc] - Quit\n\n" "\tPress number keys to change filtering g_FilterMode:\n\n" "\t1 : nearest filtering\n" "\t2 : bilinear filtering\n" "\t3 : bicubic filtering\n" "\t4 : fast bicubic filtering\n" "\t5 : Catmull-Rom filtering\n\n" ); initGLBuffers(); #if USE_BUFFER_TEX fprog = compileASMShader(GL_FRAGMENT_PROGRAM_ARB, shaderCode); if (!fprog) { exit(EXIT_SUCCESS); } #endif }
int main(int argc, char **argv) { char *dump_file = NULL; #if defined(__linux__) setenv ("DISPLAY", ":0", 0); #endif pArgc = &argc; pArgv = argv; printf("%s Starting...\n\n", sSDKsample); if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", (char **) &dump_file); int kernel = 1; if (checkCmdLineFlag(argc, (const char **)argv, "kernel")) { kernel = getCmdLineArgumentInt(argc, (const char **)argv, "kernel"); } runAutoTest(argc, argv, dump_file, kernel); } else { printf("[%s]\n", sSDKsample); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if (checkCmdLineFlag(argc, (const char **)argv, "device")) { printf("[%s]\n", argv[0]); printf(" Does not explicitly support -device=n in OpenGL mode\n"); printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); printf(" > %s -device=n -qatest\n", argv[0]); printf("exiting...\n"); exit(EXIT_SUCCESS); } // First load the image, so we know what the size of the image (imageW and imageH) printf("Allocating host and CUDA memory and loading image file...\n"); const char *image_path = sdkFindFilePath("portrait_noise.bmp", argv[0]); if (image_path == NULL) { printf("imageDenoisingGL was unable to find and load image file <portrait_noise.bmp>.\nExiting...\n"); exit(EXIT_FAILURE); } LoadBMPFile(&h_Src, &imageW, &imageH, image_path); printf("Data init done.\n"); // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL(&argc, argv); cudaGLSetGLDevice(gpuGetMaxGflopsDeviceId()); checkCudaErrors(CUDA_MallocArray(&h_Src, imageW, imageH)); initOpenGLBuffers(); } printf("Starting GLUT main loop...\n"); printf("Press [1] to view noisy image\n"); printf("Press [2] to view image restored with knn filter\n"); printf("Press [3] to view image restored with nlm filter\n"); printf("Press [4] to view image restored with modified nlm filter\n"); printf("Press [*] to view smooth/edgy areas [RED/BLUE] Ct's when a filter is active\n"); printf("Press [f] to print frame rate\n"); printf("Press [?] to print Noise and Lerp Ct's\n"); printf("Press [q] to exit\n"); sdkCreateTimer(&timer); sdkStartTimer(&timer); glutMainLoop(); }
int main(int argc, char **argv) { int devID; cudaDeviceProp deviceProps; printf("%s Starting...\n\n", sSDKname); printf("[%s] - [OpenGL/CUDA simulation] starting...\n", sSDKname); // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. if (false == initGL(&argc, argv)) { exit(EXIT_SUCCESS); } // use command-line specified CUDA device, otherwise use device with highest Gflops/s #ifndef OPTIMUS devID = findCudaGLDevice(argc, (const char **)argv); #else devID = gpuGetMaxGflopsDeviceId(); #endif // get number of SMs on this GPU checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount); // automated build testing harness if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); } // Allocate and initialize host data GLint bsize; sdkCreateTimer(&timer); sdkResetTimer(&timer); hvfield = (cData *)malloc(sizeof(cData) * DS); memset(hvfield, 0, sizeof(cData) * DS); // Allocate and initialize device data cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(cData)*DIM, DIM); cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS, cudaMemcpyHostToDevice); // Temporary complex velocity field data cudaMalloc((void **)&vxfield, sizeof(cData) * PDS); cudaMalloc((void **)&vyfield, sizeof(cData) * PDS); setupTexture(DIM, DIM); bindTexture(); // Create particle array in host memory particles = (cData *)malloc(sizeof(cData) * DS); memset(particles, 0, sizeof(cData) * DS); #ifdef BROADCAST int step = 1; // Broadcasted visualization stepping. if (argc > 3) step = atoi(argv[3]); // Create additional space to store particle packets // for broadcasting. wstep = step; hstep = step; int npackets = sizeof(float) * (DIM / wstep) * (DIM / hstep) / UdpBroadcastServer::PacketSize; if (sizeof(float) * (DIM / wstep) * (DIM / hstep) % UdpBroadcastServer::PacketSize) npackets++; packets = (char*)malloc(npackets * (UdpBroadcastServer::PacketSize + sizeof(unsigned int))); #endif initParticles(particles, DIM, DIM); #if defined(OPTIMUS) || defined(BROADCAST) // Create particle array in device memory cudaMalloc((void **)&particles_gpu, sizeof(cData) * DS); cudaMemcpy(particles_gpu, particles, sizeof(cData) * DS, cudaMemcpyHostToDevice); #endif // Create CUFFT transform plan configuration cufftPlan2d(&planr2c, DIM, DIM, CUFFT_R2C); cufftPlan2d(&planc2r, DIM, DIM, CUFFT_C2R); // TODO: update kernels to use the new unpadded memory layout for perf // rather than the old FFTW-compatible layout cufftSetCompatibilityMode(planr2c, CUFFT_COMPATIBILITY_FFTW_PADDING); cufftSetCompatibilityMode(planc2r, CUFFT_COMPATIBILITY_FFTW_PADDING); glGenBuffersARB(1, &vbo); glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo); glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(cData) * DS, particles, GL_DYNAMIC_DRAW_ARB); glGetBufferParameterivARB(GL_ARRAY_BUFFER_ARB, GL_BUFFER_SIZE_ARB, &bsize); if (bsize != (sizeof(cData) * DS)) goto EXTERR; glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0); #ifndef OPTIMUS checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone)); getLastCudaError("cudaGraphicsGLRegisterBuffer failed"); #endif if (ref_file) { autoTest(argv); cleanup(); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); printf("[fluidsGL] - Test Results: %d Failures\n", g_TotalErrors); exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); } else { #ifdef BROADCAST const char *sv_addr = "127.0.0:9097"; const char *bc_addr = "127.255.255.2:9097"; // Server address if (argc > 2) sv_addr = argv[2]; // Broadcast address if (argc > 1) bc_addr = argv[1]; server.reset(new UdpBroadcastServer(sv_addr, bc_addr)); // Listen to clients' feedbacks in a separate thread. { pthread_t tid; pthread_create(&tid, NULL, &feedback_listener, &step); } // Broadcast the particles state in a separate thread. { pthread_t tid; pthread_create(&tid, NULL, &broadcaster, &step); } #endif #if defined (__APPLE__) || defined(MACOSX) atexit(cleanup); #else glutCloseFunc(cleanup); #endif glutMainLoop(); } // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); if (!ref_file) { exit(EXIT_SUCCESS); } return 0; EXTERR: printf("Failed to initialize GL extensions.\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(EXIT_FAILURE); }
int main(int argc, char **argv) { pArgc = &argc; pArgv = argv; #if defined(__linux__) setenv ("DISPLAY", ":0", 0); #endif printf("%s Starting...\n\n", sSDKsample); if (checkCmdLineFlag(argc, (const char **)argv, "help")) { printf("\nUsage: SobelFilter <options>\n"); printf("\t\t-mode=n (0=original, 1=texture, 2=smem + texture)\n"); printf("\t\t-file=ref_orig.pgm (ref_tex.pgm, ref_shared.pgm)\n\n"); exit(EXIT_SUCCESS); } if (checkCmdLineFlag(argc, (const char **)argv, "file")) { g_bQAReadback = true; runAutoTest(argc, argv); } // use command-line specified CUDA device, otherwise use device with highest Gflops/s if (checkCmdLineFlag(argc, (const char **)argv, "device")) { printf(" This SDK does not explicitly support -device=n when running with OpenGL.\n"); printf(" When specifying -device=n (n=0,1,2,....) the sample must not use OpenGL.\n"); printf(" See details below to run without OpenGL:\n\n"); printf(" > %s -device=n\n\n", argv[0]); printf("exiting...\n"); exit(EXIT_SUCCESS); } // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL(&argc, argv); cudaGLSetGLDevice(gpuGetMaxGflopsDeviceId()); sdkCreateTimer(&timer); sdkResetTimer(&timer); glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutReshapeFunc(reshape); loadDefaultImage(argv[0]); // If code is not printing the USage, then we execute this path. printf("I: display Image (no filtering)\n"); printf("T: display Sobel Edge Detection (Using Texture)\n"); printf("S: display Sobel Edge Detection (Using SMEM+Texture)\n"); printf("Use the '-' and '=' keys to change the brightness.\n"); fflush(stdout); #if defined (__APPLE__) || defined(MACOSX) atexit(cleanup); #else glutCloseFunc(cleanup); #endif glutTimerFunc(REFRESH_DELAY, timerEvent,0); glutMainLoop(); }
void initializeCuda() { cudaError_t err = cudaGLSetGLDevice(gpuGetMaxGflopsDeviceId()); }