///////////////////////////////////////////////////// // Main program ///////////////////////////////////////////////////// int main(const int argc, const char **argv) { unsigned long start = START_SIZE; #ifdef NVS unsigned long end = END_NVS; #else /* sizeof(unsigned long) = 8 bytes */ unsigned long end = END_TITAN; #endif int stateDim = 1; /* default stateDim = 1 */ if (checkCmdLineFlag(argc, argv, "dim")) stateDim = getCmdLineArgumentInt(argc, argv, "dim"); char *typeInput = 0; getCmdLineArgumentString(argc, (const char**)argv, "type", &typeInput); if (0 != typeInput){ if (!strcasecmp(typeInput, "float")) runTest<float>(start, end, stateDim); else if (!strcasecmp(typeInput, "int")) runTest<int>(start, end, stateDim); else if (!strcasecmp(typeInput, "double")) runTest<double>(start, end, stateDim); } else runTest<double>(start, end, stateDim); exit(EXIT_SUCCESS); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { pArgc = &argc; pArgv = argv; // parse arguments char *filename; printf("Starting bicubicTexture\n"); if (checkCmdLineFlag(argc, (const char **) argv, "help")) { printHelp(); exit(EXIT_SUCCESS); } if (checkCmdLineFlag(argc, (const char **) argv, "mode")) { g_FilterMode = (eFilterMode)getCmdLineArgumentInt(argc, (const char **) argv, "mode"); if (g_FilterMode < MODE_NEAREST && g_FilterMode > MODE_CATMULL_ROM) { printf("Invalid Mode setting %d\n", g_FilterMode); exit(EXIT_FAILURE); } } if (getCmdLineArgumentString(argc, (const char **) argv, "file", &filename)) { dumpFilename = filename; fpsLimit = frameCheckNumber; // Running CUDA kernel (bicubicFiltering) without visualization (QA Testing/Verification) runAutoTest(argc, argv, (const char *)dumpFilename, g_FilterMode); } else { // This runs the CUDA kernel (bicubicFiltering) + OpenGL visualization initialize(argc, argv); glutMainLoop(); sdkDeleteTimer(&timer); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(EXIT_SUCCESS); } exit(EXIT_SUCCESS); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { char device_name[256]; char *ref_file = NULL; pArgc = &argc; pArgv = argv; printf("[%s] - Starting...\n", SDK_name); if (!findCUDADevice()) // Search for CUDA GPU { printf("> CUDA Device NOT found on \"%s\".. Exiting.\n", device_name); exit(EXIT_SUCCESS); } if (!dynlinkLoadD3D10API()) // Search for D3D API (locate drivers, does not mean device is found) { printf("> D3D10 API libraries NOT found.. Exiting.\n"); dynlinkUnloadD3D10API(); exit(EXIT_SUCCESS); } if (!findDXDevice(device_name)) // Search for D3D Hardware Device { printf("> D3D10 Graphics Device NOT found.. Exiting.\n"); dynlinkUnloadD3D10API(); exit(EXIT_SUCCESS); } // command line options if (argc > 1) { // automatied build testing harness if (checkCmdLineFlag(argc, (const char **)argv, "file")) getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); } // run D3D10/CUDA test runTest(argc, argv, ref_file); // // and exit // printf("%s running on %s exiting...\n", SDK_name, device_name); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(g_bPassed ? EXIT_SUCCESS : EXIT_FAILURE); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { pArgc = &argc; pArgv = argv; char *ref_file = NULL; printf("%s Starting...\n\n", sSDKsample); if (checkCmdLineFlag(argc, (const char **)argv, "file")) { fpsLimit = frameCheckNumber; getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); } if (ref_file) { chooseCudaDevice(argc, argv, false); loadVolumeData(argv[0]); runAutoTest(ref_file, argv[0]); } else { // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL(&argc, argv); // use command-line specified CUDA device, otherwise use device with highest Gflops/s chooseCudaDevice(argc, argv, true); // OpenGL buffers initGLBuffers(); loadVolumeData(argv[0]); } printf("Press space to toggle animation\n" "Press '+' and '-' to change displayed slice\n"); #if defined (__APPLE__) || defined(MACOSX) atexit(cleanup); #else glutCloseFunc(cleanup); #endif glutMainLoop(); exit(EXIT_SUCCESS); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { pArgc = &argc; pArgv = argv; // parse arguments char *filename; #if defined(__linux__) setenv ("DISPLAY", ":0", 0); #endif printf("Starting bicubicTexture\n"); if (checkCmdLineFlag(argc, (const char **) argv, "help")) { printHelp(); exit(EXIT_SUCCESS); } if (checkCmdLineFlag(argc, (const char **) argv, "mode")) { g_FilterMode = (eFilterMode)getCmdLineArgumentInt(argc, (const char **) argv, "mode"); if (g_FilterMode < 0 || g_FilterMode >= NUM_MODES) { printf("Invalid Mode setting %d\n", g_FilterMode); exit(EXIT_FAILURE); } } if (getCmdLineArgumentString(argc, (const char **) argv, "file", &filename)) { dumpFilename = filename; fpsLimit = frameCheckNumber; // Running CUDA kernel (bicubicFiltering) without visualization (QA Testing/Verification) runAutoTest(argc, argv, (const char *)dumpFilename, g_FilterMode); } else { // This runs the CUDA kernel (bicubicFiltering) + OpenGL visualization initialize(argc, argv); glutMainLoop(); } exit(EXIT_SUCCESS); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { char device_name[256]; char *ref_file = NULL; pArgc = &argc; pArgv = argv; printf("[%s] - Starting...\n", SDK_name); if (!findCUDADevice()) // Search for CUDA GPU { printf("> CUDA Device NOT found on \"%s\".. Exiting.\n", device_name); exit(EXIT_SUCCESS); } if (!dynlinkLoadD3D10API()) // Search for D3D API (locate drivers, does not mean device is found) { printf("> D3D10 API libraries NOT found.. Exiting.\n"); dynlinkUnloadD3D10API(); exit(EXIT_SUCCESS); } if (!findDXDevice(device_name)) // Search for D3D Hardware Device { printf("> D3D10 Graphics Device NOT found.. Exiting.\n"); dynlinkUnloadD3D10API(); exit(EXIT_SUCCESS); } // command line options if (argc > 1) { // automatied build testing harness if (checkCmdLineFlag(argc, (const char **)argv, "file")) getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); } // run D3D10/CUDA test runTest(argc, argv, ref_file); // // and exit // printf("%s running on %s exiting...\n", SDK_name, device_name); cudaDeviceReset(); exit(g_bPassed ? EXIT_SUCCESS : EXIT_FAILURE); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { char device_name[256]; char *ref_file = NULL; pArgc = &argc; pArgv = argv; printf("> %s starting...\n", sSDKSample); if (!findCUDADevice()) // Search for CUDA GPU { printf("> CUDA Device NOT found on \"%s\".. Exiting.\n", device_name); exit(EXIT_SUCCESS); } if (!dynlinkLoadD3D10API()) // Search for D3D API (locate drivers, does not mean device is found) { printf("> D3D10 API libraries NOT found on.. Exiting.\n"); dynlinkUnloadD3D10API(); exit(EXIT_SUCCESS); } if (!findDXDevice(device_name)) // Search for D3D Hardware Device { printf("> D3D10 Graphics Device NOT found.. Exiting.\n"); dynlinkUnloadD3D10API(); exit(EXIT_SUCCESS); } if (argc > 1) { if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", (char **)&ref_file); } } runTest(argc, argv, ref_file); // // and exit // printf("%s running on %s exiting...\n", sSDKSample, device_name); printf("%s sample finished returned: %s\n", sSDKSample, (g_bPassed ? "OK" : "ERROR!")); exit(g_bPassed ? EXIT_SUCCESS : EXIT_FAILURE); }
int main(int argc, char **argv) { int devID; cudaDeviceProp deviceProps; printf("%s Starting...\n\n", sSDKname); printf("[%s] - [OpenGL/CUDA simulation] starting...\n", sSDKname); // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. if (false == initGL(&argc, argv)) { exit(EXIT_SUCCESS); } // use command-line specified CUDA device, otherwise use device with highest Gflops/s #ifndef OPTIMUS devID = findCudaGLDevice(argc, (const char **)argv); #else devID = gpuGetMaxGflopsDeviceId(); #endif // get number of SMs on this GPU checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount); // automated build testing harness if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); } // Allocate and initialize host data GLint bsize; sdkCreateTimer(&timer); sdkResetTimer(&timer); hvfield = (cData *)malloc(sizeof(cData) * DS); memset(hvfield, 0, sizeof(cData) * DS); // Allocate and initialize device data cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(cData)*DIM, DIM); cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS, cudaMemcpyHostToDevice); // Temporary complex velocity field data cudaMalloc((void **)&vxfield, sizeof(cData) * PDS); cudaMalloc((void **)&vyfield, sizeof(cData) * PDS); setupTexture(DIM, DIM); bindTexture(); // Create particle array in host memory particles = (cData *)malloc(sizeof(cData) * DS); memset(particles, 0, sizeof(cData) * DS); #ifdef BROADCAST int step = 1; // Broadcasted visualization stepping. if (argc > 3) step = atoi(argv[3]); // Create additional space to store particle packets // for broadcasting. wstep = step; hstep = step; int npackets = sizeof(float) * (DIM / wstep) * (DIM / hstep) / UdpBroadcastServer::PacketSize; if (sizeof(float) * (DIM / wstep) * (DIM / hstep) % UdpBroadcastServer::PacketSize) npackets++; packets = (char*)malloc(npackets * (UdpBroadcastServer::PacketSize + sizeof(unsigned int))); #endif initParticles(particles, DIM, DIM); #if defined(OPTIMUS) || defined(BROADCAST) // Create particle array in device memory cudaMalloc((void **)&particles_gpu, sizeof(cData) * DS); cudaMemcpy(particles_gpu, particles, sizeof(cData) * DS, cudaMemcpyHostToDevice); #endif // Create CUFFT transform plan configuration cufftPlan2d(&planr2c, DIM, DIM, CUFFT_R2C); cufftPlan2d(&planc2r, DIM, DIM, CUFFT_C2R); // TODO: update kernels to use the new unpadded memory layout for perf // rather than the old FFTW-compatible layout cufftSetCompatibilityMode(planr2c, CUFFT_COMPATIBILITY_FFTW_PADDING); cufftSetCompatibilityMode(planc2r, CUFFT_COMPATIBILITY_FFTW_PADDING); glGenBuffersARB(1, &vbo); glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo); glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(cData) * DS, particles, GL_DYNAMIC_DRAW_ARB); glGetBufferParameterivARB(GL_ARRAY_BUFFER_ARB, GL_BUFFER_SIZE_ARB, &bsize); if (bsize != (sizeof(cData) * DS)) goto EXTERR; glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0); #ifndef OPTIMUS checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone)); getLastCudaError("cudaGraphicsGLRegisterBuffer failed"); #endif if (ref_file) { autoTest(argv); cleanup(); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); printf("[fluidsGL] - Test Results: %d Failures\n", g_TotalErrors); exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); } else { #ifdef BROADCAST const char *sv_addr = "127.0.0:9097"; const char *bc_addr = "127.255.255.2:9097"; // Server address if (argc > 2) sv_addr = argv[2]; // Broadcast address if (argc > 1) bc_addr = argv[1]; server.reset(new UdpBroadcastServer(sv_addr, bc_addr)); // Listen to clients' feedbacks in a separate thread. { pthread_t tid; pthread_create(&tid, NULL, &feedback_listener, &step); } // Broadcast the particles state in a separate thread. { pthread_t tid; pthread_create(&tid, NULL, &broadcaster, &step); } #endif #if defined (__APPLE__) || defined(MACOSX) atexit(cleanup); #else glutCloseFunc(cleanup); #endif glutMainLoop(); } // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); if (!ref_file) { exit(EXIT_SUCCESS); } return 0; EXTERR: printf("Failed to initialize GL extensions.\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(EXIT_FAILURE); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { // start logs int devID; char *ref_file = NULL; printf("%s Starting...\n\n", argv[0]); #if defined(__linux__) setenv ("DISPLAY", ":0", 0); #endif // use command-line specified CUDA device, otherwise use device with highest Gflops/s if (argc > 1) { if (checkCmdLineFlag(argc, (const char **)argv, "radius")) { filter_radius = getCmdLineArgumentInt(argc, (const char **) argv, "radius"); } if (checkCmdLineFlag(argc, (const char **)argv, "passes")) { iterations = getCmdLineArgumentInt(argc, (const char **)argv, "passes"); } if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", (char **)&ref_file); } } // load image to process loadImageData(argc, argv); if (checkCmdLineFlag(argc, (const char **)argv, "benchmark")) { // This is a separate mode of the sample, where we are benchmark the kernels for performance devID = findCudaDevice(argc, (const char **)argv); // Running CUDA kernels (bilateralfilter) in Benchmarking mode g_TotalErrors += runBenchmark(argc, argv); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); } else if (checkCmdLineFlag(argc, (const char **)argv, "radius") || checkCmdLineFlag(argc, (const char **)argv, "passes")) { // This overrides the default mode. Users can specify the radius used by the filter kernel devID = findCudaDevice(argc, (const char **)argv); g_TotalErrors += runSingleTest(ref_file, argv[0]); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); } else { // Default mode running with OpenGL visualization and in automatic mode // the output automatically changes animation printf("\n"); // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL(argc, (char **)argv); int dev = findCapableDevice(argc, argv); if (dev != -1) { dev = gpuGLDeviceInit(argc, (const char **)argv); if (dev == -1) { exit(EXIT_FAILURE); } } else { // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(EXIT_SUCCESS); } // Now we can create a CUDA context and bind it to the OpenGL context initCuda(); initGLResources(); // sets the callback function so it will call cleanup upon exit #if defined (__APPLE__) || defined(MACOSX) atexit(cleanup); #else glutCloseFunc(cleanup); #endif printf("Running Standard Demonstration with GLUT loop...\n\n"); printf("Press '+' and '-' to change filter width\n" "Press ']' and '[' to change number of iterations\n" "Press 'e' and 'E' to change Euclidean delta\n" "Press 'g' and 'G' to changle Gaussian delta\n" "Press 'a' or 'A' to change Animation mode ON/OFF\n\n"); // Main OpenGL loop that will run visualization for every vsync glutMainLoop(); } }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { printf("%s Starting...\n\n", argv[0]); char *typeInput = 0; getCmdLineArgumentString(argc, (const char **) argv, "type", &typeInput); ReduceType datatype = REDUCE_INT; if (0 != typeInput) { if (!strcasecmp(typeInput, "float")) { datatype = REDUCE_FLOAT; } else if (!strcasecmp(typeInput, "double")) { datatype = REDUCE_DOUBLE; } else if (strcasecmp(typeInput, "int")) { printf("Type %s is not recognized. Using default type int.\n\n", typeInput); } } cudaDeviceProp deviceProp; deviceProp.major = 1; deviceProp.minor = 0; int minimumComputeVersion = 10; if (datatype == REDUCE_DOUBLE) { deviceProp.minor = 3; minimumComputeVersion = 13; } int dev; dev = findCudaDevice(argc, (const char **)argv); checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); if ((deviceProp.major * 10 + deviceProp.minor) >= minimumComputeVersion) { printf("Using Device %d: %s\n\n", dev, deviceProp.name); checkCudaErrors(cudaSetDevice(dev)); } else { printf("Error: the selected device does not support the minimum compute capability of %d.%d.\n\n", minimumComputeVersion / 10, minimumComputeVersion % 10); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(EXIT_FAILURE); } printf("Reducing array of type %s\n\n", getReduceTypeString(datatype)); bool bResult = false; switch (datatype) { default: case REDUCE_INT: bResult = runTest<int>(argc, argv, datatype); break; case REDUCE_FLOAT: bResult = runTest<float>(argc, argv, datatype); break; case REDUCE_DOUBLE: bResult = runTest<double>(argc, argv, datatype); break; } // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); printf(bResult ? "Test passed\n" : "Test failed!\n"); }
int main(int argc, char **argv) { pArgc = &argc; pArgv = argv; char *ref_file = NULL; printf("%s Starting...\n\n", sSDKsample); //start logs if (checkCmdLineFlag(argc, (const char **)argv, "help")) { printHelp(); exit(EXIT_SUCCESS); } if (checkCmdLineFlag(argc, (const char **)argv, "file")) { fpsLimit = frameCheckNumber; getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); } if (ref_file) { if (checkCmdLineFlag(argc, (const char **)argv, "device")) { int device = findCudaDevice(argc, (const char **)argv); if (device < 0) { printf("No CUDA Capable devices found, exiting...\n"); exit(EXIT_SUCCESS); } checkDeviceMeetComputeSpec(argc, argv); } else { int dev = findCapableDevice(argc, argv); if (dev != -1) { cudaSetDevice(dev); } else { cudaDeviceReset(); exit(EXIT_SUCCESS); } } } else { if (checkCmdLineFlag(argc, (const char **)argv, "device")) { printf(" This SDK does not explicitly support -device=n when running with OpenGL.\n"); printf(" When specifying -device=n (n=0,1,2,....) the sample must not use OpenGL.\n"); printf(" See details below to run without OpenGL:\n\n"); printf(" > %s -device=n -file=output.bin\n\n", argv[0]); printf("exiting...\n"); exit(EXIT_SUCCESS); } // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL(&argc, argv); int dev = findCapableDevice(argc, argv); if (dev != -1) { cudaGLSetGLDevice(dev); } else { exit(EXIT_SUCCESS); } } // load volume data initData(argc, argv); printf( "Press \n" " 'SPACE' to toggle animation\n" " 'p' to toggle pre-integrated transfer function\n" " '+' and '-' to change density (0.01 increments)\n" " ']' and '[' to change brightness\n" " ';' and ''' to modify transfer function offset\n" " '.' and ',' to modify transfer function scale\n\n"); if (ref_file) { runSingleTest(ref_file, argv[0]); } else { // This is the normal rendering path for VolumeRender glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutMouseFunc(mouse); glutMotionFunc(motion); glutReshapeFunc(reshape); glutIdleFunc(idle); initPixelBuffer(); atexit(cleanup); glutMainLoop(); } cudaDeviceReset(); }
int main(int argc, char **argv) { // Start logs printf("%s Starting...\n\n", argv[0]); unsigned int useDoublePrecision; char *precisionChoice; getCmdLineArgumentString(argc, (const char **)argv, "type", &precisionChoice); if (precisionChoice == NULL) { useDoublePrecision = 0; } else { if (!STRCASECMP(precisionChoice, "double")) { useDoublePrecision = 1; } else { useDoublePrecision = 0; } } unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION]; float *h_OutputGPU, *d_Output; int dim, pos; double delta, ref, sumDelta, sumRef, L1norm, gpuTime; StopWatchInterface *hTimer = NULL; if (sizeof(INT64) != 8) { printf("sizeof(INT64) != 8\n"); return 0; } // use command-line specified CUDA device, otherwise use device with highest Gflops/s int dev = findCudaDevice(argc, (const char **)argv); sdkCreateTimer(&hTimer); int deviceIndex; checkCudaErrors(cudaGetDevice(&deviceIndex)); cudaDeviceProp deviceProp; checkCudaErrors(cudaGetDeviceProperties(&deviceProp, deviceIndex)); int version = deviceProp.major * 10 + deviceProp.minor; if (useDoublePrecision && version < 13) { printf("Double precision not supported.\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return 0; } printf("Allocating GPU memory...\n"); checkCudaErrors(cudaMalloc((void **)&d_Output, QRNG_DIMENSIONS * N * sizeof(float))); printf("Allocating CPU memory...\n"); h_OutputGPU = (float *)malloc(QRNG_DIMENSIONS * N * sizeof(float)); printf("Initializing QRNG tables...\n\n"); initQuasirandomGenerator(tableCPU); if (useDoublePrecision) { initTable_SM13(tableCPU); } else { initTable_SM10(tableCPU); } printf("Testing QRNG...\n\n"); checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float))); int numIterations = 20; for (int i = -1; i < numIterations; i++) { if (i == 0) { checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); } if (useDoublePrecision) { quasirandomGenerator_SM13(d_Output, 0, N); } else { quasirandomGenerator_SM10(d_Output, 0, N); } } checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); gpuTime = sdkGetTimerValue(&hTimer)/(double)numIterations*1e-3; printf("quasirandomGenerator, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", (double)QRNG_DIMENSIONS * (double)N * 1.0E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128*QRNG_DIMENSIONS); printf("\nReading GPU results...\n"); checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost)); printf("Comparing to the CPU results...\n\n"); sumDelta = 0; sumRef = 0; for (dim = 0; dim < QRNG_DIMENSIONS; dim++) for (pos = 0; pos < N; pos++) { ref = getQuasirandomValue63(pos, dim); delta = (double)h_OutputGPU[dim * N + pos] - ref; sumDelta += fabs(delta); sumRef += fabs(ref); } printf("L1 norm: %E\n", sumDelta / sumRef); printf("\nTesting inverseCNDgpu()...\n\n"); checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float))); for (int i = -1; i < numIterations; i++) { if (i == 0) { checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); } if (useDoublePrecision) { inverseCND_SM13(d_Output, NULL, QRNG_DIMENSIONS * N); } else { inverseCND_SM10(d_Output, NULL, QRNG_DIMENSIONS * N); } } checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); gpuTime = sdkGetTimerValue(&hTimer)/(double)numIterations*1e-3; printf("quasirandomGenerator-inverse, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", (double)QRNG_DIMENSIONS * (double)N * 1E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128); printf("Reading GPU results...\n"); checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost)); printf("\nComparing to the CPU results...\n"); sumDelta = 0; sumRef = 0; unsigned int distance = ((unsigned int)-1) / (QRNG_DIMENSIONS * N + 1); for (pos = 0; pos < QRNG_DIMENSIONS * N; pos++) { unsigned int d = (pos + 1) * distance; ref = MoroInvCNDcpu(d); delta = (double)h_OutputGPU[pos] - ref; sumDelta += fabs(delta); sumRef += fabs(ref); } printf("L1 norm: %E\n\n", L1norm = sumDelta / sumRef); printf("Shutting down...\n"); sdkDeleteTimer(&hTimer); free(h_OutputGPU); checkCudaErrors(cudaFree(d_Output)); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(L1norm < 1e-6 ? EXIT_SUCCESS : EXIT_FAILURE); }
int main(int argc, char **argv) { char *multiMethodChoice = NULL; char *scalingChoice = NULL; bool use_threads = true; bool bqatest = false; bool strongScaling = false; pArgc = &argc; pArgv = argv; printf("%s Starting...\n\n", argv[0]); if (checkCmdLineFlag(argc, (const char **)argv, "qatest")) { bqatest = true; } getCmdLineArgumentString(argc, (const char **)argv, "method", &multiMethodChoice); getCmdLineArgumentString(argc, (const char **)argv, "scaling", &scalingChoice); if (checkCmdLineFlag(argc, (const char **)argv, "h") || checkCmdLineFlag(argc, (const char **)argv, "help")) { usage(); exit(EXIT_SUCCESS); } if (multiMethodChoice == NULL) { use_threads = true; } else { if (!strcasecmp(multiMethodChoice, "threaded")) { use_threads = true; } else { use_threads = false; } } if (use_threads == false) { printf("Using single CPU thread for multiple GPUs\n"); } if (scalingChoice == NULL) { strongScaling = false; } else { if (!strcasecmp(scalingChoice, "strong")) { strongScaling = true; } else { strongScaling = false; } } //GPU number present in the system int GPU_N; checkCudaErrors(cudaGetDeviceCount(&GPU_N)); int nOptions = 256; nOptions = adjustProblemSize(GPU_N, nOptions); // select problem size int scale = (strongScaling) ? 1 : GPU_N; int OPT_N = nOptions * scale; int PATH_N = 262144; const unsigned long long SEED = 777; // initialize the timers hTimer = new StopWatchInterface*[GPU_N]; for (int i=0; i<GPU_N; i++) { sdkCreateTimer(&hTimer[i]); sdkResetTimer(&hTimer[i]); } //Input data array TOptionData *optionData = new TOptionData[OPT_N]; //Final GPU MC results TOptionValue *callValueGPU = new TOptionValue[OPT_N]; //"Theoretical" call values by Black-Scholes formula float *callValueBS = new float[OPT_N]; //Solver config TOptionPlan *optionSolver = new TOptionPlan[GPU_N]; //OS thread ID CUTThread *threadID = new CUTThread[GPU_N]; int gpuBase, gpuIndex; int i; float time; double delta, ref, sumDelta, sumRef, sumReserve; printf("MonteCarloMultiGPU\n"); printf("==================\n"); printf("Parallelization method = %s\n", use_threads ? "threaded" : "streamed"); printf("Problem scaling = %s\n", strongScaling? "strong" : "weak"); printf("Number of GPUs = %d\n", GPU_N); printf("Total number of options = %d\n", OPT_N); printf("Number of paths = %d\n", PATH_N); printf("main(): generating input data...\n"); srand(123); for (i=0; i < OPT_N; i++) { optionData[i].S = randFloat(5.0f, 50.0f); optionData[i].X = randFloat(10.0f, 25.0f); optionData[i].T = randFloat(1.0f, 5.0f); optionData[i].R = 0.06f; optionData[i].V = 0.10f; callValueGPU[i].Expected = -1.0f; callValueGPU[i].Confidence = -1.0f; } printf("main(): starting %i host threads...\n", GPU_N); //Get option count for each GPU for (i = 0; i < GPU_N; i++) { optionSolver[i].optionCount = OPT_N / GPU_N; } //Take into account cases with "odd" option counts for (i = 0; i < (OPT_N % GPU_N); i++) { optionSolver[i].optionCount++; } //Assign GPU option ranges gpuBase = 0; for (i = 0; i < GPU_N; i++) { optionSolver[i].device = i; optionSolver[i].optionData = optionData + gpuBase; optionSolver[i].callValue = callValueGPU + gpuBase; // all devices use the same global seed, but start // the sequence at a different offset optionSolver[i].seed = SEED; optionSolver[i].pathN = PATH_N; gpuBase += optionSolver[i].optionCount; } if (use_threads || bqatest) { //Start CPU thread for each GPU for (gpuIndex = 0; gpuIndex < GPU_N; gpuIndex++) { threadID[gpuIndex] = cutStartThread((CUT_THREADROUTINE)solverThread, &optionSolver[gpuIndex]); } printf("main(): waiting for GPU results...\n"); cutWaitForThreads(threadID, GPU_N); printf("main(): GPU statistics, threaded\n"); for (i = 0; i < GPU_N; i++) { cudaDeviceProp deviceProp; checkCudaErrors(cudaGetDeviceProperties(&deviceProp, optionSolver[i].device)); printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name); printf("Options : %i\n", optionSolver[i].optionCount); printf("Simulation paths: %i\n", optionSolver[i].pathN); time = sdkGetTimerValue(&hTimer[i]); printf("Total time (ms.): %f\n", time); printf("Options per sec.: %f\n", OPT_N / (time * 0.001)); } printf("main(): comparing Monte Carlo and Black-Scholes results...\n"); sumDelta = 0; sumRef = 0; sumReserve = 0; for (i = 0; i < OPT_N; i++) { BlackScholesCall(callValueBS[i], optionData[i]); delta = fabs(callValueBS[i] - callValueGPU[i].Expected); ref = callValueBS[i]; sumDelta += delta; sumRef += fabs(ref); if (delta > 1e-6) { sumReserve += callValueGPU[i].Confidence / delta; } #ifdef PRINT_RESULTS printf("BS: %f; delta: %E\n", callValueBS[i], delta); #endif } sumReserve /= OPT_N; } if (!use_threads || bqatest) { multiSolver(optionSolver, GPU_N); printf("main(): GPU statistics, streamed\n"); for (i = 0; i < GPU_N; i++) { cudaDeviceProp deviceProp; checkCudaErrors(cudaGetDeviceProperties(&deviceProp, optionSolver[i].device)); printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name); printf("Options : %i\n", optionSolver[i].optionCount); printf("Simulation paths: %i\n", optionSolver[i].pathN); } time = sdkGetTimerValue(&hTimer[0]); printf("\nTotal time (ms.): %f\n", time); printf("\tNote: This is elapsed time for all to compute.\n"); printf("Options per sec.: %f\n", OPT_N / (time * 0.001)); printf("main(): comparing Monte Carlo and Black-Scholes results...\n"); sumDelta = 0; sumRef = 0; sumReserve = 0; for (i = 0; i < OPT_N; i++) { BlackScholesCall(callValueBS[i], optionData[i]); delta = fabs(callValueBS[i] - callValueGPU[i].Expected); ref = callValueBS[i]; sumDelta += delta; sumRef += fabs(ref); if (delta > 1e-6) { sumReserve += callValueGPU[i].Confidence / delta; } #ifdef PRINT_RESULTS printf("BS: %f; delta: %E\n", callValueBS[i], delta); #endif } sumReserve /= OPT_N; } #ifdef DO_CPU printf("main(): running CPU MonteCarlo...\n"); TOptionValue callValueCPU; sumDelta = 0; sumRef = 0; for (i = 0; i < OPT_N; i++) { MonteCarloCPU( callValueCPU, optionData[i], NULL, PATH_N ); delta = fabs(callValueCPU.Expected - callValueGPU[i].Expected); ref = callValueCPU.Expected; sumDelta += delta; sumRef += fabs(ref); printf("Exp : %f | %f\t", callValueCPU.Expected, callValueGPU[i].Expected); printf("Conf: %f | %f\n", callValueCPU.Confidence, callValueGPU[i].Confidence); } printf("L1 norm: %E\n", sumDelta / sumRef); #endif printf("Shutting down...\n"); for (int i=0; i<GPU_N; i++) { sdkStartTimer(&hTimer[i]); checkCudaErrors(cudaSetDevice(i)); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); } delete[] optionSolver; delete[] callValueBS; delete[] callValueGPU; delete[] optionData; delete[] threadID; delete[] hTimer; printf("Test Summary...\n"); printf("L1 norm : %E\n", sumDelta / sumRef); printf("Average reserve: %f\n", sumReserve); printf(sumReserve > 1.0f ? "Test passed\n" : "Test failed!\n"); exit(sumReserve > 1.0f ? EXIT_SUCCESS : EXIT_FAILURE); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { int devID = 0; char *ref_file = NULL; pArgc = &argc; pArgv = argv; // start logs printf("%s Starting...\n\n", argv[0]); if (checkCmdLineFlag(argc, (const char **)argv, "help")) { printHelp(); exit(EXIT_SUCCESS); } // use command-line specified CUDA device, otherwise use device with highest Gflops/s if (argc > 1) { if (checkCmdLineFlag(argc, (const char **)argv, "threads")) { nthreads = getCmdLineArgumentInt(argc, (const char **) argv, "threads"); } if (checkCmdLineFlag(argc, (const char **)argv, "radius")) { filter_radius = getCmdLineArgumentInt(argc, (const char **) argv, "radius"); } if (checkCmdLineFlag(argc, (const char **)argv, "passes")) { iterations = getCmdLineArgumentInt(argc, (const char **) argv, "passes"); } if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", (char **)&ref_file); } } // load image to process loadImageData(argc, argv); if (checkCmdLineFlag(argc, (const char **)argv, "benchmark")) { // This is a separate mode of the sample, where we are benchmark the kernels for performance devID = findCudaDevice(argc, (const char **)argv); // Running CUDA kernels (boxfilter) in Benchmarking mode g_TotalErrors += runBenchmark(); exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); } else if (checkCmdLineFlag(argc, (const char **)argv, "radius") || checkCmdLineFlag(argc, (const char **)argv, "passes")) { // This overrides the default mode. Users can specify the radius used by the filter kernel devID = findCudaDevice(argc, (const char **)argv); g_TotalErrors += runSingleTest(ref_file, argv[0]); exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); } else { // Default mode running with OpenGL visualization and in automatic mode // the output automatically changes animation printf("\n"); // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL(&argc, argv); int dev = findCapableDevice(argc, argv); if (dev != -1) { cudaGLSetGLDevice(dev); } else { cudaDeviceReset(); exit(EXIT_SUCCESS); } // Now we can create a CUDA context and bind it to the OpenGL context initCuda(); initGLResources(); // sets the callback function so it will call cleanup upon exit atexit(cleanup); printf("Running Standard Demonstration with GLUT loop...\n\n"); printf("Press '+' and '-' to change filter width\n" "Press ']' and '[' to change number of iterations\n" "Press 'a' or 'A' to change animation ON/OFF\n\n"); // Main OpenGL loop that will run visualization for every vsync glutMainLoop(); cudaDeviceReset(); exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); } }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { #if defined(__linux__) setenv ("DISPLAY", ":0", 0); #endif printf("%s Starting...\n\n", sSDKsample); printf("NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n\n"); numParticles = NUM_PARTICLES; uint gridDim = GRID_SIZE; numIterations = 0; if (argc > 1) { if (checkCmdLineFlag(argc, (const char **) argv, "n")) { numParticles = getCmdLineArgumentInt(argc, (const char **)argv, "n"); } if (checkCmdLineFlag(argc, (const char **) argv, "grid")) { gridDim = getCmdLineArgumentInt(argc, (const char **) argv, "grid"); } if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", &g_refFile); fpsLimit = frameCheckNumber; numIterations = 1; } } gridSize.x = gridSize.y = gridSize.z = gridDim; printf("grid: %d x %d x %d = %d cells\n", gridSize.x, gridSize.y, gridSize.z, gridSize.x*gridSize.y*gridSize.z); printf("particles: %d\n", numParticles); bool benchmark = checkCmdLineFlag(argc, (const char **) argv, "benchmark") != 0; if (checkCmdLineFlag(argc, (const char **) argv, "i")) { numIterations = getCmdLineArgumentInt(argc, (const char **) argv, "i"); } if (g_refFile) { cudaInit(argc, argv); } else { if (checkCmdLineFlag(argc, (const char **)argv, "device")) { printf("[%s]\n", argv[0]); printf(" Does not explicitly support -device=n in OpenGL mode\n"); printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); printf(" > %s -device=n -file=<*.bin>\n", argv[0]); printf("exiting...\n"); exit(EXIT_SUCCESS); } initGL(&argc, argv); cudaGLInit(argc, argv); } initParticleSystem(numParticles, gridSize, g_refFile==NULL); initParams(); if (!g_refFile) { initMenus(); } if (benchmark || g_refFile) { if (numIterations <= 0) { numIterations = 300; } runBenchmark(numIterations, argv[0]); } else { glutDisplayFunc(display); glutReshapeFunc(reshape); glutMouseFunc(mouse); glutMotionFunc(motion); glutKeyboardFunc(key); glutSpecialFunc(special); glutIdleFunc(idle); glutCloseFunc(cleanup); glutMainLoop(); } if (psystem) { delete psystem; } // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(g_TotalErrors > 0 ? EXIT_FAILURE : EXIT_SUCCESS); }
void initData(int argc, char **argv) { // parse arguments char *filename; if (getCmdLineArgumentString(argc, (const char **) argv, "file", &filename)) { volumeFilename = filename; } int n; if (checkCmdLineFlag(argc, (const char **) argv, "size")) { n = getCmdLineArgumentInt(argc, (const char **) argv, "size"); volumeSize.width = volumeSize.height = volumeSize.depth = n; } if (checkCmdLineFlag(argc, (const char **) argv, "xsize")) { n = getCmdLineArgumentInt(argc, (const char **) argv, "xsize"); volumeSize.width = n; } if (checkCmdLineFlag(argc, (const char **) argv, "ysize")) { n = getCmdLineArgumentInt(argc, (const char **) argv, "ysize"); volumeSize.height = n; } if (checkCmdLineFlag(argc, (const char **) argv, "zsize")) { n = getCmdLineArgumentInt(argc, (const char **) argv, "zsize"); volumeSize.depth = n; } char *path = sdkFindFilePath(volumeFilename, argv[0]); if (path == 0) { printf("Error finding file '%s'\n", volumeFilename); exit(EXIT_FAILURE); } size_t size = volumeSize.width*volumeSize.height*volumeSize.depth*sizeof(VolumeType); void *h_volume = loadRawFile(path, size); FilterKernel_init(); Volume_init(&volumeOriginal,volumeSize, h_volume, 0); free(h_volume); Volume_init(&volumeFilter0, volumeSize, NULL, 1); Volume_init(&volumeFilter1, volumeSize, NULL, 1); VolumeRender_init(); VolumeRender_setPreIntegrated(preIntegrated); VolumeRender_setVolume(&volumeOriginal); sdkCreateTimer(&timer); sdkCreateTimer(&animationTimer); sdkStartTimer(&animationTimer); // calculate new grid size gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y)); }
int main(int argc, char *argv[]) { printf("%s Starting...\n\n", argv[0]); try { std::string sFilename; char *filePath; // set your own FreeImage error handler FreeImage_SetOutputMessage(FreeImageErrorHandler); cudaDeviceInit(argc, (const char **)argv); // Min spec is SM 1.0 devices if (printfNPPinfo(argc, argv, 1, 0) == false) { cudaDeviceReset(); exit(EXIT_SUCCESS); } if (checkCmdLineFlag(argc, (const char **)argv, "input")) { getCmdLineArgumentString(argc, (const char **)argv, "input", &filePath); } else { filePath = sdkFindFilePath("Lena.pgm", argv[0]); } if (filePath) { sFilename = filePath; } else { sFilename = "Lena.pgm"; } // if we specify the filename at the command line, then we only test sFilename // otherwise we will check both sFilename[0,1] int file_errors = 0; std::ifstream infile(sFilename.data(), std::ifstream::in); if (infile.good()) { std::cout << "freeImageInteropNPP opened: <" << sFilename.data() << "> successfully!" << std::endl; file_errors = 0; infile.close(); } else { std::cout << "freeImageInteropNPP unable to open: <" << sFilename.data() << ">" << std::endl; file_errors++; infile.close(); } if (file_errors > 0) { exit(EXIT_FAILURE); } std::string sResultFilename = sFilename; std::string::size_type dot = sResultFilename.rfind('.'); if (dot != std::string::npos) { sResultFilename = sResultFilename.substr(0, dot); } sResultFilename += "_boxFilterFII.pgm"; if (checkCmdLineFlag(argc, (const char **)argv, "output")) { char *outputFilePath; getCmdLineArgumentString(argc, (const char **)argv, "output", &outputFilePath); sResultFilename = outputFilePath; } FREE_IMAGE_FORMAT eFormat = FreeImage_GetFileType(sFilename.c_str()); // no signature? try to guess the file format from the file extension if (eFormat == FIF_UNKNOWN) { eFormat = FreeImage_GetFIFFromFilename(sFilename.c_str()); } NPP_ASSERT(eFormat != FIF_UNKNOWN); // check that the plugin has reading capabilities ... FIBITMAP *pBitmap; if (FreeImage_FIFSupportsReading(eFormat)) { pBitmap = FreeImage_Load(eFormat, sFilename.c_str()); } NPP_ASSERT(pBitmap != 0); // Dump the bitmap information to the console std::cout << (*pBitmap) << std::endl; // make sure this is an 8-bit single channel image NPP_ASSERT(FreeImage_GetColorType(pBitmap) == FIC_MINISBLACK); NPP_ASSERT(FreeImage_GetBPP(pBitmap) == 8); unsigned int nImageWidth = FreeImage_GetWidth(pBitmap); unsigned int nImageHeight = FreeImage_GetHeight(pBitmap); unsigned int nSrcPitch = FreeImage_GetPitch(pBitmap); unsigned char *pSrcData = FreeImage_GetBits(pBitmap); int nSrcPitchCUDA; Npp8u *pSrcImageCUDA = nppiMalloc_8u_C1(nImageWidth, nImageHeight, &nSrcPitchCUDA); NPP_ASSERT_NOT_NULL(pSrcImageCUDA); // copy image loaded via FreeImage to into CUDA device memory, i.e. // transfer the image-data up to the GPU's video-memory NPP_CHECK_CUDA(cudaMemcpy2D(pSrcImageCUDA, nSrcPitchCUDA, pSrcData, nSrcPitch, nImageWidth, nImageHeight, cudaMemcpyHostToDevice)); // define size of the box filter const NppiSize oMaskSize = {7, 7}; const NppiPoint oMaskAchnor = {0, 0}; // compute maximal result image size const NppiSize oSizeROI = {(int)nImageWidth - (oMaskSize.width - 1), (int)nImageHeight - (oMaskSize.height - 1) }; // allocate result image memory int nDstPitchCUDA; Npp8u *pDstImageCUDA = nppiMalloc_8u_C1(oSizeROI.width, oSizeROI.height, &nDstPitchCUDA); NPP_ASSERT_NOT_NULL(pDstImageCUDA); NPP_CHECK_NPP(nppiFilterBox_8u_C1R(pSrcImageCUDA, nSrcPitchCUDA, pDstImageCUDA, nDstPitchCUDA, oSizeROI, oMaskSize, oMaskAchnor)); // create the result image storage using FreeImage so we can easily // save FIBITMAP *pResultBitmap = FreeImage_Allocate(oSizeROI.width, oSizeROI.height, 8 /* bits per pixel */); NPP_ASSERT_NOT_NULL(pResultBitmap); unsigned int nResultPitch = FreeImage_GetPitch(pResultBitmap); unsigned char *pResultData = FreeImage_GetBits(pResultBitmap); NPP_CHECK_CUDA(cudaMemcpy2D(pResultData, nResultPitch, pDstImageCUDA, nDstPitchCUDA, oSizeROI.width, oSizeROI.height, cudaMemcpyDeviceToHost)); // now save the result image bool bSuccess; bSuccess = FreeImage_Save(FIF_PGM, pResultBitmap, sResultFilename.c_str(), 0) == TRUE; NPP_ASSERT_MSG(bSuccess, "Failed to save result image."); //free nppiImage nppiFree(pSrcImageCUDA); nppiFree(pDstImageCUDA); cudaDeviceReset(); exit(EXIT_SUCCESS); } catch (npp::Exception &rException) { std::cerr << "Program error! The following exception occurred: \n"; std::cerr << rException << std::endl; std::cerr << "Aborting." << std::endl; exit(EXIT_FAILURE); } catch (...) { std::cerr << "Program error! An unknow type of exception occurred. \n"; std::cerr << "Aborting." << std::endl; exit(EXIT_FAILURE); } exit(EXIT_SUCCESS); }
int main(int argc, char **argv) { char *dump_file = NULL; #if defined(__linux__) setenv ("DISPLAY", ":0", 0); #endif pArgc = &argc; pArgv = argv; printf("%s Starting...\n\n", sSDKsample); if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", (char **) &dump_file); int kernel = 1; if (checkCmdLineFlag(argc, (const char **)argv, "kernel")) { kernel = getCmdLineArgumentInt(argc, (const char **)argv, "kernel"); } runAutoTest(argc, argv, dump_file, kernel); } else { printf("[%s]\n", sSDKsample); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if (checkCmdLineFlag(argc, (const char **)argv, "device")) { printf("[%s]\n", argv[0]); printf(" Does not explicitly support -device=n in OpenGL mode\n"); printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); printf(" > %s -device=n -qatest\n", argv[0]); printf("exiting...\n"); exit(EXIT_SUCCESS); } // First load the image, so we know what the size of the image (imageW and imageH) printf("Allocating host and CUDA memory and loading image file...\n"); const char *image_path = sdkFindFilePath("portrait_noise.bmp", argv[0]); if (image_path == NULL) { printf("imageDenoisingGL was unable to find and load image file <portrait_noise.bmp>.\nExiting...\n"); exit(EXIT_FAILURE); } LoadBMPFile(&h_Src, &imageW, &imageH, image_path); printf("Data init done.\n"); // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL(&argc, argv); cudaGLSetGLDevice(gpuGetMaxGflopsDeviceId()); checkCudaErrors(CUDA_MallocArray(&h_Src, imageW, imageH)); initOpenGLBuffers(); } printf("Starting GLUT main loop...\n"); printf("Press [1] to view noisy image\n"); printf("Press [2] to view image restored with knn filter\n"); printf("Press [3] to view image restored with nlm filter\n"); printf("Press [4] to view image restored with modified nlm filter\n"); printf("Press [*] to view smooth/edgy areas [RED/BLUE] Ct's when a filter is active\n"); printf("Press [f] to print frame rate\n"); printf("Press [?] to print Noise and Lerp Ct's\n"); printf("Press [q] to exit\n"); sdkCreateTimer(&timer); sdkStartTimer(&timer); glutMainLoop(); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { pArgc = &argc; pArgv = argv; char *ref_file = NULL; #if defined(__linux__) setenv ("DISPLAY", ":0", 0); #endif printf("%s Starting...\n\n", sSDKsample); printf("NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n\n"); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if (argc > 1) { if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); fpsLimit = frameCheckNumber; } } // Get the path of the filename char *filename; if (getCmdLineArgumentString(argc, (const char **) argv, "image", &filename)) { image_filename = filename; } // load image char *image_path = sdkFindFilePath(image_filename, argv[0]); if (image_path == NULL) { fprintf(stderr, "Error unable to find and load image file: '%s'\n", image_filename); exit(EXIT_FAILURE); } sdkLoadPPM4ub(image_path, (unsigned char **)&h_img, &width, &height); if (!h_img) { printf("Error unable to load PPM file: '%s'\n", image_path); exit(EXIT_FAILURE); } printf("Loaded '%s', %d x %d pixels\n", image_path, width, height); if (checkCmdLineFlag(argc, (const char **)argv, "threads")) { nthreads = getCmdLineArgumentInt(argc, (const char **) argv, "threads"); } if (checkCmdLineFlag(argc, (const char **)argv, "sigma")) { sigma = getCmdLineArgumentFloat(argc, (const char **) argv, "sigma"); } runBenchmark = checkCmdLineFlag(argc, (const char **) argv, "benchmark"); int device; struct cudaDeviceProp prop; cudaGetDevice(&device); cudaGetDeviceProperties(&prop, device); if (!strncmp("Tesla", prop.name, 5)) { printf("Tesla card detected, running the test in benchmark mode (no OpenGL display)\n"); // runBenchmark = true; runBenchmark = true; } // Benchmark or AutoTest mode detected, no OpenGL if (runBenchmark == true || ref_file != NULL) { findCudaDevice(argc, (const char **)argv); } else { // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL(&argc, argv); findCudaGLDevice(argc, (const char **)argv); } initCudaBuffers(); if (ref_file) { printf("(Automated Testing)\n"); bool testPassed = runSingleTest(ref_file, argv[0]); cleanup(); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(testPassed ? EXIT_SUCCESS : EXIT_FAILURE); } if (runBenchmark) { printf("(Run Benchmark)\n"); benchmark(100); cleanup(); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(EXIT_SUCCESS); } initGLBuffers(); glutMainLoop(); exit(EXIT_SUCCESS); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { printf("%s Starting...\n\n", sSDKsample); numParticles = NUM_PARTICLES; maxNumParticles = MAX_NUM_PARTICLES; uint gridDim = GRID_SIZE; numIterations = 0; printf("Surely I can get this far\n"); if (argc > 1) { if (checkCmdLineFlag(argc, (const char **) argv, "n")) { numParticles = getCmdLineArgumentInt(argc, (const char **)argv, "n"); } if (checkCmdLineFlag(argc, (const char **) argv, "grid")) { gridDim = getCmdLineArgumentInt(argc, (const char **) argv, "grid"); } if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", &g_refFile); fpsLimit = frameCheckNumber; numIterations = 1; } } //******************************************************* // RMK Hard code for cylindrical coords (y=theta=1) // DomainSize //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot/ZVert.txt"; //char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot/RVert.txt"; //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot-big/ZVert.txt"; //char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot-big/RVert.txt"; //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-big/ZVert.txt"; //char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-big/RVert.txt"; //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot-big-refine/ZVert.txt"; //char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-nothot-big-refine/RVert.txt"; char Zfile[] = "/home/rkeedy/CFD/BuoyantStrumJet85-big-refine-lighter/ZVert.txt"; char Rfile[] = "/home/rkeedy/CFD/BuoyantStrumJet85-big-refine-lighter/RVert.txt"; //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet62-big-refine-lighter/ZVert.txt"; //char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet62-big-refine-lighter/RVert.txt"; //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-big-refine/ZVert.txt"; //char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet85-big-refine/RVert.txt"; //char Zfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet63-big-refine/ZVert.txt"; //char Rfile[] = "/home/rkeedy/Dropbox/CFD/BuoyantStrumJet63-big-refine/RVert.txt"; numVelNodes.x = filecount(Rfile); //-1; numVelNodes.z = filecount(Zfile); //-1; numVelNodes.y = 1; numCells.x = 80; //47; //24; //29; numCells.y = 1; numCells.z = 160; //188; //95; //88; numParticles = numCells.x*numCells.z*20; //avgnumparticles = 40 srand( time( NULL ) ); //numParticles = numCells.x*numCells.z*40; printf("vel grid: %d x %d x %d = %d cells\n", numVelNodes.x, numVelNodes.y, numVelNodes.z, numVelNodes.x*numVelNodes.y*numVelNodes.z); printf(" grid: %d x %d x %d = %d cells\n", numCells.x, numCells.y, numCells.z, numCells.x*numCells.y*numCells.z); //printf("vel grid: %d x %d x %d = %d cells\n", gridSize.x, gridSize.y, gridSize.z, gridSize.x*gridSize.y*gridSize.z); bool benchmark = checkCmdLineFlag(argc, (const char **) argv, "benchmark") != 0; if (checkCmdLineFlag(argc, (const char **) argv, "i")) { numIterations = getCmdLineArgumentInt(argc, (const char **) argv, "i"); } if (g_refFile) { cudaInit(argc, argv); } else { if (checkCmdLineFlag(argc, (const char **)argv, "device")) { printf("[%s]\n", argv[0]); printf(" Does not explicitly support -device=n in OpenGL mode\n"); printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); printf(" > %s -device=n -file=<*.bin>\n", argv[0]); printf("exiting...\n"); exit(EXIT_SUCCESS); } initGL(&argc, argv); cudaGLInit(argc, argv); } // Moved code snippet to CellSystem //initCellSystem(gridSize); // now moved to particlesystem printf("Begin initialization\n"); //initParticleSystem(numParticles, gridSize, g_refFile==NULL); initParticleSystem(maxNumParticles, numParticles, numVelNodes, numCells, g_refFile==NULL); //printf("Finished with initParticleSystem, %d\n",g_refFile==NULL); //cin.ignore(); initParams(); printf("Finished with initialization\n"); if (!g_refFile) { initMenus(); } if (benchmark || g_refFile) { if (numIterations <= 0) { numIterations = 300; } runBenchmark(numIterations, argv[0]); } else { glutDisplayFunc(display); glutReshapeFunc(reshape); glutMouseFunc(mouse); glutMotionFunc(motion); glutKeyboardFunc(key); glutSpecialFunc(special); glutIdleFunc(idle); atexit(cleanup); glutMainLoop(); } if (psystem) { delete psystem; } cudaDeviceReset(); exit(g_TotalErrors > 0 ? EXIT_FAILURE : EXIT_SUCCESS); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char *argv[]) { char device_name[NAME_LEN]; char *ref_file = NULL; pArgc = &argc; pArgv = argv; printf("[%s] - Starting...\n", SDK_name); if (!findGraphicsGPU(device_name)) { printf("> %s not supported on \"%s\" exiting...\n", SDK_name, device_name); exit(EXIT_SUCCESS); } // command line options if (argc > 1) { // automatied build testing harness if (checkCmdLineFlag(argc, (const char **)argv, "file")) getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); } // // create window // // Register the window class #if 1 WNDCLASSEX wc = { sizeof(WNDCLASSEX), CS_CLASSDC, MsgProc, 0L, 0L, GetModuleHandle(NULL), NULL, NULL, NULL, NULL, "CUDA/D3D9 Texture InterOP", NULL }; RegisterClassEx(&wc); int xBorder = ::GetSystemMetrics(SM_CXSIZEFRAME); int yMenu = ::GetSystemMetrics(SM_CYMENU); int yBorder = ::GetSystemMetrics(SM_CYSIZEFRAME); // Create the application's window (padding by window border for uniform BB sizes across OSs) HWND hWnd = CreateWindow(wc.lpszClassName, "CUDA/D3D9 Texture InterOP", WS_OVERLAPPEDWINDOW, 0, 0, g_WindowWidth + 2*xBorder, g_WindowHeight+ 2*yBorder+yMenu, NULL, NULL, wc.hInstance, NULL); #else static WNDCLASSEX wc = { sizeof(WNDCLASSEX), CS_CLASSDC, MsgProc, 0L, 0L, GetModuleHandle(NULL), NULL, NULL, NULL, NULL, "CudaD3D9Tex", NULL }; RegisterClassEx(&wc); HWND hWnd = CreateWindow( "CudaD3D9Tex", "CUDA D3D9 Texture Interop", WS_OVERLAPPEDWINDOW, 0, 0, 800, 320, GetDesktopWindow(), NULL, wc.hInstance, NULL); #endif ShowWindow(hWnd, SW_SHOWDEFAULT); UpdateWindow(hWnd); // Initialize Direct3D if (SUCCEEDED(InitD3D9(hWnd)) && SUCCEEDED(InitCUDA()) && SUCCEEDED(InitTextures())) { if (!g_bDeviceLost) { RegisterD3D9ResourceWithCUDA(); } } // // the main loop // while (false == g_bDone) { RunCUDA(); DrawScene(); // // handle I/O // MSG msg; ZeroMemory(&msg, sizeof(msg)); while (msg.message!=WM_QUIT) { if (PeekMessage(&msg, NULL, 0U, 0U, PM_REMOVE)) { TranslateMessage(&msg); DispatchMessage(&msg); } else { RunCUDA(); DrawScene(); if (ref_file) { for (int count=0; count<g_iFrameToCompare; count++) { RunCUDA(); DrawScene(); } const char *cur_image_path = "simpleD3D9Texture.ppm"; // Save a reference of our current test run image CheckRenderD3D9::BackbufferToPPM(g_pD3DDevice, cur_image_path); // compare to offical reference image, printing PASS or FAIL. g_bPassed = CheckRenderD3D9::PPMvsPPM(cur_image_path, ref_file, argv[0], MAX_EPSILON, 0.15f); g_bDone = true; Cleanup(); PostQuitMessage(0); } } } }; // Unregister windows class UnregisterClass(wc.lpszClassName, wc.hInstance); // // and exit // printf("> %s running on %s exiting...\n", SDK_name, device_name); exit(g_bPassed ? EXIT_SUCCESS : EXIT_FAILURE); }
int main(int argc, char **argv) { int devID; cudaDeviceProp deviceProps; printf("%s Starting...\n\n", sSDKname); printf("[%s] - [OpenGL/CUDA simulation] starting...\n", sSDKname); // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. if (false == initGL(&argc, argv)) { exit(EXIT_SUCCESS); } // use command-line specified CUDA device, otherwise use device with highest Gflops/s devID = findCudaGLDevice(argc, (const char **)argv); // get number of SMs on this GPU checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount); // automated build testing harness if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); } // Allocate and initialize host data GLint bsize; sdkCreateTimer(&timer); sdkResetTimer(&timer); hvfield = (cData *)malloc(sizeof(cData) * DS); memset(hvfield, 0, sizeof(cData) * DS); // Allocate and initialize device data cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(cData)*DIM, DIM); cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS, cudaMemcpyHostToDevice); // Temporary complex velocity field data cudaMalloc((void **)&vxfield, sizeof(cData) * PDS); cudaMalloc((void **)&vyfield, sizeof(cData) * PDS); setupTexture(DIM, DIM); bindTexture(); // Create particle array particles = (cData *)malloc(sizeof(cData) * DS); memset(particles, 0, sizeof(cData) * DS); initParticles(particles, DIM, DIM); // Create CUFFT transform plan configuration cufftPlan2d(&planr2c, DIM, DIM, CUFFT_R2C); cufftPlan2d(&planc2r, DIM, DIM, CUFFT_C2R); // TODO: update kernels to use the new unpadded memory layout for perf // rather than the old FFTW-compatible layout cufftSetCompatibilityMode(planr2c, CUFFT_COMPATIBILITY_FFTW_PADDING); cufftSetCompatibilityMode(planc2r, CUFFT_COMPATIBILITY_FFTW_PADDING); glGenBuffersARB(1, &vbo); glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo); glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(cData) * DS, particles, GL_DYNAMIC_DRAW_ARB); glGetBufferParameterivARB(GL_ARRAY_BUFFER_ARB, GL_BUFFER_SIZE_ARB, &bsize); if (bsize != (sizeof(cData) * DS)) goto EXTERR; glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0); checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone)); getLastCudaError("cudaGraphicsGLRegisterBuffer failed"); if (ref_file) { autoTest(argv); cleanup(); cudaDeviceReset(); printf("[fluidsGL] - Test Results: %d Failures\n", g_TotalErrors); exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); } else { atexit(cleanup); glutMainLoop(); } cudaDeviceReset(); if (!ref_file) { exit(EXIT_SUCCESS); } return 0; EXTERR: printf("Failed to initialize GL extensions.\n"); cudaDeviceReset(); exit(EXIT_FAILURE); }
void runAutoTest(int argc, char *argv[]) { printf("[%s] (automated testing w/ readback)\n", sSDKsample); int devID = findCudaDevice(argc, (const char **)argv); loadDefaultImage(argv[0]); Pixel *d_result; checkCudaErrors(cudaMalloc((void **)&d_result, imWidth*imHeight*sizeof(Pixel))); char *ref_file = NULL; char dump_file[256]; int mode = 0; mode = getCmdLineArgumentInt(argc, (const char **)argv, "mode"); getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); switch (mode) { case 0: g_SobelDisplayMode = SOBELDISPLAY_IMAGE; sprintf(dump_file, "lena_orig.pgm"); break; case 1: g_SobelDisplayMode = SOBELDISPLAY_SOBELTEX; sprintf(dump_file, "lena_tex.pgm"); break; case 2: g_SobelDisplayMode = SOBELDISPLAY_SOBELSHARED; sprintf(dump_file, "lena_shared.pgm"); break; default: printf("Invalid Filter Mode File\n"); exit(EXIT_FAILURE); break; } printf("AutoTest: %s <%s>\n", sSDKsample, filterMode[g_SobelDisplayMode]); sobelFilter(d_result, imWidth, imHeight, g_SobelDisplayMode, imageScale); checkCudaErrors(cudaDeviceSynchronize()); unsigned char *h_result = (unsigned char *)malloc(imWidth*imHeight*sizeof(Pixel)); checkCudaErrors(cudaMemcpy(h_result, d_result, imWidth*imHeight*sizeof(Pixel), cudaMemcpyDeviceToHost)); sdkSavePGM(dump_file, h_result, imWidth, imHeight); if (!sdkComparePGM(dump_file, sdkFindFilePath(ref_file, argv[0]), MAX_EPSILON_ERROR, 0.15f, false)) { g_TotalErrors++; } checkCudaErrors(cudaFree(d_result)); free(h_result); if (g_TotalErrors != 0) { printf("Test failed!\n"); exit(EXIT_FAILURE); } printf("Test passed!\n"); exit(EXIT_SUCCESS); }