/////////////////////////////////////////////////////////////////////////////// /// application entry point /////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { // welcome message printf("%s Starting...\n\n", sSDKsample); // pick GPU findCudaDevice(argc, (const char **)argv); // find images const char *const sourceFrameName = "frame10.ppm"; const char *const targetFrameName = "frame11.ppm"; // image dimensions int width; int height; // row access stride int stride; // flow is computed from source image to target image float *h_source; // source image, host memory float *h_target; // target image, host memory // load image from file if (!LoadImageAsFP32(h_source, width, height, stride, sourceFrameName, argv[0])) { exit(EXIT_FAILURE); } if (!LoadImageAsFP32(h_target, width, height, stride, targetFrameName, argv[0])) { exit(EXIT_FAILURE); } // allocate host memory for CPU results float *h_uGold = new float [stride * height]; float *h_vGold = new float [stride * height]; // allocate host memory for GPU results float *h_u = new float [stride * height]; float *h_v = new float [stride * height]; // smoothness // if image brightness is not within [0,1] // this paramter should be scaled appropriately const float alpha = 0.2f; // number of pyramid levels const int nLevels = 5; // number of solver iterations on each level const int nSolverIters = 500; // number of warping iterations const int nWarpIters = 3; ComputeFlowGold(h_source, h_target, width, height, stride, alpha, nLevels, nWarpIters, nSolverIters, h_uGold, h_vGold); ComputeFlowCUDA(h_source, h_target, width, height, stride, alpha, nLevels, nWarpIters, nSolverIters, h_u, h_v); // compare results (L1 norm) bool status = CompareWithGold(width, height, stride, h_uGold, h_vGold, h_u, h_v); WriteFloFile("FlowGPU.flo", width, height, stride, h_u, h_v); WriteFloFile("FlowCPU.flo", width, height, stride, h_uGold, h_vGold); // free resources delete [] h_uGold; 
delete [] h_vGold; delete [] h_u; delete [] h_v; delete [] h_source; delete [] h_target; // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); // report self-test status exit(status ? EXIT_SUCCESS : EXIT_FAILURE); }
void runCuda(){ // Map OpenGL buffer object for writing from CUDA on a single GPU // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not use this buffer if(iterations < renderCam->iterations) { uchar4 *dptr=NULL; iterations++; cudaGLMapBufferObject((void**)&dptr, pbo); //pack geom and material arrays geom* geoms = new geom[renderScene->objects.size()]; material* materials = new material[renderScene->materials.size()]; for(int i=0; i<renderScene->objects.size(); i++){ geoms[i] = renderScene->objects[i]; } for(int i=0; i<renderScene->materials.size(); i++){ materials[i] = renderScene->materials[i]; } // execute the kernel cudaRaytraceCore(dptr, renderCam, targetFrame, iterations, materials, renderScene->materials.size(), geoms, renderScene->objects.size() ); // unmap buffer object cudaGLUnmapBufferObject(pbo); } else { if(!finishedRender){ //output image file image outputImage(renderCam->resolution.x, renderCam->resolution.y); for(int x=0; x<renderCam->resolution.x; x++){ for(int y=0; y<renderCam->resolution.y; y++){ int index = x + (y * renderCam->resolution.x); outputImage.writePixelRGB(renderCam->resolution.x-1-x,y,renderCam->image[index]); } } gammaSettings gamma; gamma.applyGamma = false; gamma.gamma = 1.0/2.2; gamma.divisor = 1.0; outputImage.setGammaSettings(gamma); string filename = renderCam->imageName; string s; stringstream out; out << targetFrame; s = out.str(); utilityCore::replaceString(filename, ".bmp", "."+s+".bmp"); utilityCore::replaceString(filename, ".png", "."+s+".png"); outputImage.saveImageRGB(filename); cout << "Saved frame " << s << " to " << filename << endl; finishedRender = true; if(singleFrameMode==true){ cudaDeviceReset(); exit(0); } } if(targetFrame < renderCam->frames-1){ //clear image buffer and move onto next frame targetFrame++; iterations = 0; for(int i=0; i<renderCam->resolution.x*renderCam->resolution.y; i++){ renderCam->image[i] = glm::vec3(0,0,0); } cudaDeviceReset(); finishedRender = false; } } }
// Can't be called directly in cpu-miner.c void cuda_devicereset() { cudaDeviceSynchronize(); cudaDeviceReset(); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { int devID = 0; char *ref_file = NULL; #if defined(__linux__) setenv ("DISPLAY", ":0", 0); #endif pArgc = &argc; pArgv = argv; // start logs printf("%s Starting...\n\n", argv[0]); if (checkCmdLineFlag(argc, (const char **)argv, "help")) { printHelp(); exit(EXIT_SUCCESS); } // use command-line specified CUDA device, otherwise use device with highest Gflops/s if (argc > 1) { if (checkCmdLineFlag(argc, (const char **)argv, "threads")) { nthreads = getCmdLineArgumentInt(argc, (const char **) argv, "threads"); } if (checkCmdLineFlag(argc, (const char **)argv, "radius")) { filter_radius = getCmdLineArgumentInt(argc, (const char **) argv, "radius"); } if (checkCmdLineFlag(argc, (const char **)argv, "passes")) { iterations = getCmdLineArgumentInt(argc, (const char **) argv, "passes"); } if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", (char **)&ref_file); } } // load image to process loadImageData(argc, argv); if (checkCmdLineFlag(argc, (const char **)argv, "benchmark")) { // This is a separate mode of the sample, where we are benchmark the kernels for performance devID = findCudaDevice(argc, (const char **)argv); // Running CUDA kernels (boxfilter) in Benchmarking mode g_TotalErrors += runBenchmark(); exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); } else if (checkCmdLineFlag(argc, (const char **)argv, "radius") || checkCmdLineFlag(argc, (const char **)argv, "passes")) { // This overrides the default mode. Users can specify the radius used by the filter kernel devID = findCudaDevice(argc, (const char **)argv); g_TotalErrors += runSingleTest(ref_file, argv[0]); exit(g_TotalErrors == 0 ? 
EXIT_SUCCESS : EXIT_FAILURE); } else { // Default mode running with OpenGL visualization and in automatic mode // the output automatically changes animation printf("\n"); // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL(&argc, argv); int dev = findCapableDevice(argc, argv); if (dev != -1) { cudaGLSetGLDevice(dev); } else { // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(EXIT_SUCCESS); } // Now we can create a CUDA context and bind it to the OpenGL context initCuda(true); initGLResources(); // sets the callback function so it will call cleanup upon exit #if defined (__APPLE__) || defined(MACOSX) atexit(cleanup); #else glutCloseFunc(cleanup); #endif printf("Running Standard Demonstration with GLUT loop...\n\n"); printf("Press '+' and '-' to change filter width\n" "Press ']' and '[' to change number of iterations\n" "Press 'a' or 'A' to change animation ON/OFF\n\n"); // Main OpenGL loop that will run visualization for every vsync glutMainLoop(); } }
int main(int argc, char *argv[]) { printf("%s Starting...\n\n", argv[0]); try { std::string sFilename; char *filePath = sdkFindFilePath("Lena.pgm", argv[0]); if (filePath) { sFilename = filePath; } else { printf("Error unable to find Lena.pgm\n"); exit(EXIT_FAILURE); } // set your own FreeImage error handler FreeImage_SetOutputMessage(FreeImageErrorHandler); cudaDeviceInit(argc, (const char **)argv); // Min spec is SM 1.0 devices if (printfNPPinfo(argc, argv, 1, 0) == false) { cudaDeviceReset(); exit(EXIT_SUCCESS); } if (argc > 1) { sFilename = argv[1]; } // if we specify the filename at the command line, then we only test sFilename // otherwise we will check both sFilename[0,1] int file_errors = 0; std::ifstream infile(sFilename.data(), std::ifstream::in); if (infile.good()) { std::cout << "freeImageInteropNPP opened: <" << sFilename.data() << "> successfully!" << std::endl; file_errors = 0; infile.close(); } else { std::cout << "freeImageInteropNPP unable to open: <" << sFilename.data() << ">" << std::endl; file_errors++; infile.close(); } if (file_errors > 0) { exit(EXIT_FAILURE); } std::string sResultFilename = sFilename; std::string::size_type dot = sResultFilename.rfind('.'); if (dot != std::string::npos) { sResultFilename = sResultFilename.substr(0, dot); } sResultFilename += "_boxFilterFII.pgm"; if (argc >= 3) { sResultFilename = argv[2]; } FREE_IMAGE_FORMAT eFormat = FreeImage_GetFileType(sFilename.c_str()); // no signature? try to guess the file format from the file extension if (eFormat == FIF_UNKNOWN) { eFormat = FreeImage_GetFIFFromFilename(sFilename.c_str()); } NPP_ASSERT(eFormat != FIF_UNKNOWN); // check that the plugin has reading capabilities ... 
FIBITMAP *pBitmap; if (FreeImage_FIFSupportsReading(eFormat)) { pBitmap = FreeImage_Load(eFormat, sFilename.c_str()); } NPP_ASSERT(pBitmap != 0); // Dump the bitmap information to the console std::cout << (*pBitmap) << std::endl; // make sure this is an 8-bit single channel image NPP_ASSERT(FreeImage_GetColorType(pBitmap) == FIC_MINISBLACK); NPP_ASSERT(FreeImage_GetBPP(pBitmap) == 8); unsigned int nImageWidth = FreeImage_GetWidth(pBitmap); unsigned int nImageHeight = FreeImage_GetHeight(pBitmap); unsigned int nSrcPitch = FreeImage_GetPitch(pBitmap); unsigned char *pSrcData = FreeImage_GetBits(pBitmap); int nSrcPitchCUDA; Npp8u *pSrcImageCUDA = nppiMalloc_8u_C1(nImageWidth, nImageHeight, &nSrcPitchCUDA); NPP_ASSERT_NOT_NULL(pSrcImageCUDA); // copy image loaded via FreeImage to into CUDA device memory, i.e. // transfer the image-data up to the GPU's video-memory NPP_CHECK_CUDA(cudaMemcpy2D(pSrcImageCUDA, nSrcPitchCUDA, pSrcData, nSrcPitch, nImageWidth, nImageHeight, cudaMemcpyHostToDevice)); // define size of the box filter const NppiSize oMaskSize = {7, 7}; const NppiPoint oMaskAchnor = {0, 0}; // compute maximal result image size const NppiSize oSizeROI = {nImageWidth - (oMaskSize.width - 1), nImageHeight - (oMaskSize.height - 1) }; // allocate result image memory int nDstPitchCUDA; Npp8u *pDstImageCUDA = nppiMalloc_8u_C1(oSizeROI.width, oSizeROI.height, &nDstPitchCUDA); NPP_ASSERT_NOT_NULL(pDstImageCUDA); NPP_CHECK_NPP(nppiFilterBox_8u_C1R(pSrcImageCUDA, nSrcPitchCUDA, pDstImageCUDA, nDstPitchCUDA, oSizeROI, oMaskSize, oMaskAchnor)); // create the result image storage using FreeImage so we can easily // save FIBITMAP *pResultBitmap = FreeImage_Allocate(oSizeROI.width, oSizeROI.height, 8 /* bits per pixel */); NPP_ASSERT_NOT_NULL(pResultBitmap); unsigned int nResultPitch = FreeImage_GetPitch(pResultBitmap); unsigned char *pResultData = FreeImage_GetBits(pResultBitmap); NPP_CHECK_CUDA(cudaMemcpy2D(pResultData, nResultPitch, pDstImageCUDA, nDstPitchCUDA, 
oSizeROI.width, oSizeROI.height, cudaMemcpyDeviceToHost)); // now save the result image bool bSuccess; bSuccess = FreeImage_Save(FIF_PGM, pResultBitmap, sResultFilename.c_str(), 0) == TRUE; NPP_ASSERT_MSG(bSuccess, "Failed to save result image."); //free nppiImage nppiFree(pSrcImageCUDA); nppiFree(pDstImageCUDA); cudaDeviceReset(); exit(EXIT_SUCCESS); } catch (npp::Exception &rException) { std::cerr << "Program error! The following exception occurred: \n"; std::cerr << rException << std::endl; std::cerr << "Aborting." << std::endl; exit(EXIT_FAILURE); } catch (...) { std::cerr << "Program error! An unknow type of exception occurred. \n"; std::cerr << "Aborting." << std::endl; exit(EXIT_FAILURE); } exit(EXIT_SUCCESS); }
//////////////////////////////////////////////////////////////////////////////// // Test driver //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { cudaError_t error; printf("%s Starting...\n\n", argv[0]); printf("Starting up CUDA context...\n"); int dev = findCudaDevice(argc, (const char **)argv); uint *h_InputKey, *h_InputVal, *h_OutputKeyGPU, *h_OutputValGPU; uint *d_InputKey, *d_InputVal, *d_OutputKey, *d_OutputVal; StopWatchInterface *hTimer = NULL; const uint N = 1048576; const uint DIR = 0; const uint numValues = 65536; const uint numIterations = 1; printf("Allocating and initializing host arrays...\n\n"); sdkCreateTimer(&hTimer); h_InputKey = (uint *)malloc(N * sizeof(uint)); h_InputVal = (uint *)malloc(N * sizeof(uint)); h_OutputKeyGPU = (uint *)malloc(N * sizeof(uint)); h_OutputValGPU = (uint *)malloc(N * sizeof(uint)); srand(2001); for (uint i = 0; i < N; i++) { h_InputKey[i] = rand() % numValues; h_InputVal[i] = i; } printf("Allocating and initializing CUDA arrays...\n\n"); error = cudaMalloc((void **)&d_InputKey, N * sizeof(uint)); checkCudaErrors(error); error = cudaMalloc((void **)&d_InputVal, N * sizeof(uint)); checkCudaErrors(error); error = cudaMalloc((void **)&d_OutputKey, N * sizeof(uint)); checkCudaErrors(error); error = cudaMalloc((void **)&d_OutputVal, N * sizeof(uint)); checkCudaErrors(error); error = cudaMemcpy(d_InputKey, h_InputKey, N * sizeof(uint), cudaMemcpyHostToDevice); checkCudaErrors(error); error = cudaMemcpy(d_InputVal, h_InputVal, N * sizeof(uint), cudaMemcpyHostToDevice); checkCudaErrors(error); int flag = 1; printf("Running GPU bitonic sort (%u identical iterations)...\n\n", numIterations); for (uint arrayLength = 64; arrayLength <= N; arrayLength *= 2) { printf("Testing array length %u (%u arrays per batch)...\n", arrayLength, N / arrayLength); error = cudaDeviceSynchronize(); checkCudaErrors(error); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); uint threadCount 
= 0; for (uint i = 0; i < numIterations; i++) threadCount = bitonicSort( d_OutputKey, d_OutputVal, d_InputKey, d_InputVal, N / arrayLength, arrayLength, DIR ); error = cudaDeviceSynchronize(); checkCudaErrors(error); sdkStopTimer(&hTimer); printf("Average time: %f ms\n\n", sdkGetTimerValue(&hTimer) / numIterations); if (arrayLength == N) { double dTimeSecs = 1.0e-3 * sdkGetTimerValue(&hTimer) / numIterations; printf("sortingNetworks-bitonic, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u elements, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)arrayLength/dTimeSecs), dTimeSecs, arrayLength, 1, threadCount); } printf("\nValidating the results...\n"); printf("...reading back GPU results\n"); error = cudaMemcpy(h_OutputKeyGPU, d_OutputKey, N * sizeof(uint), cudaMemcpyDeviceToHost); checkCudaErrors(error); error = cudaMemcpy(h_OutputValGPU, d_OutputVal, N * sizeof(uint), cudaMemcpyDeviceToHost); checkCudaErrors(error); int keysFlag = validateSortedKeys(h_OutputKeyGPU, h_InputKey, N / arrayLength, arrayLength, numValues, DIR); int valuesFlag = validateValues(h_OutputKeyGPU, h_OutputValGPU, h_InputKey, N / arrayLength, arrayLength); flag = flag && keysFlag && valuesFlag; printf("\n"); } printf("Shutting down...\n"); sdkDeleteTimer(&hTimer); cudaFree(d_OutputVal); cudaFree(d_OutputKey); cudaFree(d_InputVal); cudaFree(d_InputKey); free(h_OutputValGPU); free(h_OutputKeyGPU); free(h_InputVal); free(h_InputKey); cudaDeviceReset(); exit(flag ? EXIT_SUCCESS : EXIT_FAILURE); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { pArgc = &argc; pArgv = argv; printf("%s Starting...\n\n", argv[0]); printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n"); int deviceCount = 0; cudaError_t error_id = cudaGetDeviceCount(&deviceCount); if (error_id != cudaSuccess) { printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id)); printf("Result = FAIL\n"); exit(EXIT_FAILURE); } // This function call returns 0 if there are no CUDA capable devices. if (deviceCount == 0) { printf("There are no available device(s) that support CUDA\n"); } else { printf("Detected %d CUDA Capable device(s)\n", deviceCount); } int dev, driverVersion = 0, runtimeVersion = 0; for (dev = 0; dev < deviceCount; ++dev) { cudaSetDevice(dev); cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name); // Console log cudaDriverGetVersion(&driverVersion); cudaRuntimeGetVersion(&runtimeVersion); printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10); printf(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor); char msg[256]; SPRINTF(msg, " Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem); printf("%s", msg); printf(" (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n", deviceProp.multiProcessorCount, _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount); printf(" GPU Max Clock rate: %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f); 
#if CUDART_VERSION >= 5000 // This is supported in CUDA 5.0 (runtime API device properties) printf(" Memory Clock rate: %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f); printf(" Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth); if (deviceProp.l2CacheSize) { printf(" L2 Cache Size: %d bytes\n", deviceProp.l2CacheSize); } #else // This only available in CUDA 4.0-4.2 (but these were only exposed in the CUDA Driver API) int memoryClock; getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev); printf(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f); int memBusWidth; getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev); printf(" Memory Bus Width: %d-bit\n", memBusWidth); int L2CacheSize; getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev); if (L2CacheSize) { printf(" L2 Cache Size: %d bytes\n", L2CacheSize); } #endif printf(" Maximum Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n", deviceProp.maxTexture1D , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1], deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]); printf(" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n", deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]); printf(" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d layers\n", deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]); printf(" Total amount of constant memory: %lu bytes\n", deviceProp.totalConstMem); printf(" Total amount of shared memory per block: %lu bytes\n", deviceProp.sharedMemPerBlock); printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock); printf(" Warp size: %d\n", deviceProp.warpSize); printf(" Maximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor); printf(" Maximum number of threads per block: %d\n", 
deviceProp.maxThreadsPerBlock); printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]); printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]); printf(" Maximum memory pitch: %lu bytes\n", deviceProp.memPitch); printf(" Texture alignment: %lu bytes\n", deviceProp.textureAlignment); printf(" Concurrent copy and kernel execution: %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount); printf(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No"); printf(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No"); printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No"); printf(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No"); printf(" Device has ECC support: %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled"); #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n", deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)"); #endif printf(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? 
"Yes" : "No"); printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n", deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID); const char *sComputeMode[] = { "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", "Prohibited (no host thread can use ::cudaSetDevice() with this device)", "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", "Unknown", NULL }; printf(" Compute Mode:\n"); printf(" < %s >\n", sComputeMode[deviceProp.computeMode]); } // If there are 2 or more GPUs, query to determine whether RDMA is supported if (deviceCount >= 2) { cudaDeviceProp prop[64]; int gpuid[64]; // we want to find the first two GPU's that can support P2P int gpu_p2p_count = 0; for (int i=0; i < deviceCount; i++) { checkCudaErrors(cudaGetDeviceProperties(&prop[i], i)); // Only boards based on Fermi or later can support P2P if ((prop[i].major >= 2) #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) // on Windows (64-bit), the Tesla Compute Cluster driver for windows must be enabled to supprot this && prop[i].tccDriver #endif ) { // This is an array of P2P capable GPUs gpuid[gpu_p2p_count++] = i; } } // Show all the combinations of support P2P GPUs int can_access_peer_0_1, can_access_peer_1_0; if (gpu_p2p_count >= 2) { for (int i = 0; i < gpu_p2p_count-1; i++) { for (int j = 1; j < gpu_p2p_count; j++) { checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer_0_1, gpuid[i], gpuid[j])); printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid[i]].name, gpuid[i], prop[gpuid[j]].name, gpuid[j] , can_access_peer_0_1 ? 
"Yes" : "No"); } } for (int j = 1; j < gpu_p2p_count; j++) { for (int i = 0; i < gpu_p2p_count-1; i++) { checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer_1_0, gpuid[j], gpuid[i])); printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid[j]].name, gpuid[j], prop[gpuid[i]].name, gpuid[i] , can_access_peer_1_0 ? "Yes" : "No"); } } } } // csv masterlog info // ***************************** // exe and CUDA driver name printf("\n"); std::string sProfileString = "deviceQuery, CUDA Driver = CUDART"; char cTemp[16]; // driver version sProfileString += ", CUDA Driver Version = "; #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10); #else sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10); #endif sProfileString += cTemp; // Runtime version sProfileString += ", CUDA Runtime Version = "; #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10); #else sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10); #endif sProfileString += cTemp; // Device count sProfileString += ", NumDevs = "; #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) sprintf_s(cTemp, 10, "%d", deviceCount); #else sprintf(cTemp, "%d", deviceCount); #endif sProfileString += cTemp; // Print Out all device Names for (dev = 0; dev < deviceCount; ++dev) { #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) sprintf_s(cTemp, 13, ", Device%d = ", dev); #else sprintf(cTemp, ", Device%d = ", dev); #endif cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); sProfileString += cTemp; sProfileString += deviceProp.name; } sProfileString += "\n"; printf("%s", sProfileString.c_str()); printf("Result = PASS\n"); // finish // cudaDeviceReset causes the driver to clean up all state. 
While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(EXIT_SUCCESS); }
int main(int argc, char *argv[]) { // needed to work correctly with piped benchmarkrunner setlinebuf(stdout); setlinebuf(stdin); int n_indices = 1; int n_dimensions = 1; char inBuf[200]; // ridiculously large input buffer. bool isFirst = true; do { // Allocate memory for the arrays int *h_indices = 0; double *h_outputGPU = 0; try { h_indices = new int [n_indices * n_dimensions]; h_outputGPU = new double [n_indices * n_dimensions]; } catch (std::exception e) { std::cerr << "Caught exception: " << e.what() << std::endl; std::cerr << "Unable to allocate CPU memory (try running with fewer vectors/dimensions)" << std::endl; return -1; } int *d_indices; double *d_output; try { cudaError_t cudaResult; cudaResult = cudaMalloc((void **)&d_indices, n_dimensions * n_indices * sizeof(int)); if (cudaResult != cudaSuccess) { throw std::runtime_error(cudaGetErrorString(cudaResult)); } } catch (std::runtime_error e) { std::cerr << "Caught exception: " << e.what() << std::endl; std::cerr << "Unable to allocate GPU memory (try running with fewer vectors/dimensions)" << std::endl; return -1; } // Initialize the indices (done on the host) for(int i = 0; i < n_indices; i++) { h_indices[i] = i; } // Copy the indices to the device cudaMemcpy(d_indices, h_indices, n_dimensions * n_indices * sizeof(int), cudaMemcpyHostToDevice); cudaDeviceSynchronize(); // Execute the QRNG on the device int n_vec; sobol_nikola_unsimplified(n_indices, d_indices, n_indices, &d_output, &n_vec); cudaDeviceSynchronize(); cudaMemcpy(h_outputGPU, d_output, n_indices * n_dimensions * sizeof(double), cudaMemcpyDeviceToHost); // Cleanup and terminate delete h_indices; cudaFree(d_indices); cudaFree(d_output); if(!isFirst) { printf("RESULT "); for(int i = 0; i < std::min(n_indices,10); i++) printf("%f ", h_outputGPU[i]); printf("\n"); } else { printf("OK\n"); isFirst = false; } delete h_outputGPU; fgets(inBuf, 200, stdin); if (sscanf(inBuf, "%u", &n_indices) == 0) { // if input is not a number, it has to be "EXIT" if 
(strncmp("EXIT",inBuf,4)==0) { printf("OK\n"); break; } else { printf("ERROR. Bad input: %s\n", inBuf); break; } } } while (true); cudaDeviceReset(); return -1; }
int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) { int dev_id = device_map[thr_id]; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; const int swap = 1; // to toggle nonce endian uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 256*256*8 if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) ((uint32_t*)ptarget)[7] = 0; if (!init[thr_id]) { cudaSetDevice(dev_id); if (opt_cudaschedule == -1 && gpu_threads == 1) { cudaDeviceReset(); // reduce cpu usage cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); CUDA_LOG_ERROR(); } cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput); quark_skein512_cpu_init(thr_id, throughput); cuda_check_cpu_init(thr_id, throughput); CUDA_SAFE_CALL(cudaDeviceSynchronize()); init[thr_id] = true; } uint32_t endiandata[20]; for (int k=0; k < 19; k++) be32enc(&endiandata[k], pdata[k]); skein512_cpu_setBlock_80((void*)endiandata); cuda_check_cpu_setTarget(ptarget); do { int order = 0; // Hash with CUDA skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], swap); quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); *hashes_done = pdata[19] - first_nonce + throughput; uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); if (foundNonce != UINT32_MAX) { uint32_t _ALIGN(64) vhash64[8]; endiandata[19] = swab32_if(foundNonce, swap); skein2hash(vhash64, endiandata); if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) { int res = 1; uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); work_set_target_ratio(work, vhash64); if (secNonce != 0) { if (!opt_quiet) applog(LOG_BLUE, "GPU #%d: found second nonce %08x !", dev_id, swab32(secNonce)); endiandata[19] = swab32_if(secNonce, swap); skein2hash(vhash64, endiandata); if (bn_hash_target_ratio(vhash64, ptarget) > 
work->shareratio) work_set_target_ratio(work, vhash64); pdata[21] = swab32_if(secNonce, !swap); res++; } pdata[19] = swab32_if(foundNonce, !swap); return res; } else { gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce); } } if ((uint64_t) throughput + pdata[19] >= max_nonce) { pdata[19] = max_nonce; break; } pdata[19] += throughput; } while (!work_restart[thr_id].restart); *hashes_done = pdata[19] - first_nonce; return 0; }
int main(int argc, char **argv) { unsigned int n_particles = NUM_PARTICLES; namespace po = boost::program_options; po::options_description desc("Allowed options"); desc.add_options() ("help,h", "Produces help message") ("file,f", po::value<std::string>(), "Output file") (",n", po::value<unsigned int>(), "Particle number") ("algorithm,a", po::value<std::string>(), "Contact Detection algorithm") ("distribution,d", po::value<std::string>(), "Type of distribution (fluid, sparse or dense)"); po::variables_map vm; po::store(po::parse_command_line(argc,argv,desc), vm); po::notify(vm); if(vm.count("help")){ std::cout << desc << std::endl; return 0; } if(vm.count("-n")){ //std::cout << vm << std::endl; n_particles = vm["-n"].as<unsigned int>(); } NeighboorAlg neighAlg = DM; if(vm.count("algorithm")){ std::string alg = vm["algorithm"].as<std::string>(); if(alg=="DC") neighAlg = DC; else if(alg=="SCD") neighAlg = SCD; else if(alg=="DM") neighAlg = DM; else if(alg=="CM") neighAlg = CM; else if(alg=="SAS") neighAlg = SAS; } SystemType distr = DENSE; if(vm.count("distribution")){ std::string d = vm["distribution"].as<std::string>(); if(d=="dense") distr = DENSE; else if(d=="fluid") distr = FLUID; else if(d=="sparse") distr = SPARSE; } ParticleSystem *system = new ParticleSystem(n_particles, neighAlg, distr); std::cout << "Starting simulation..." << std::endl << "N = " << n_particles << std::endl; std::string out_file; if(vm.count("file")){ out_file = vm["file"].as<std::string>(); system->setOutputFile(out_file); std::cout << "Output = " << out_file << std::endl; } float total_time; system->printInfo(); total_time = system->run(); system->cleanUp(); std::cout << "Total time = " << total_time << " ms" << std::endl; //TODO Fazer grid-size relatico a quantidade de partículas e a // tipo de simulação por linha de comando //TODO botar para selecionar device por entrada tbm // cleanup delete system; cudaDeviceReset(); }
void initialize(std::string test_name) { m_test_name = test_name; if(LogToFile) { m_ofsLog.open("Log.txt"); m_ofsData.close(); if (m_ofsLog.is_open()) { std::cout << " File opened: " << "Log.txt" <<std::endl; } m_oLog.rdbuf(m_ofsLog.rdbuf()); } else { m_oLog.rdbuf(std::cout.rdbuf()); } m_filename = m_test_name + TTestVariant::getTestVariantDescription(); m_ofsData.close(); m_ofsData.open(m_filename+".dump"); if (m_ofsData.is_open()) { std::cout << " File opened: " << m_filename <<std::endl; } else { ERRORMSG("Could not open data file: " << m_filename); } m_oData.rdbuf(m_ofsData.rdbuf()); // Set GPU Device to use! m_oLog << " Set GPU Device: " << UseGPUDeviceID <<std::endl; cudaDeviceReset(); CHECK_CUDA(cudaSetDevice(UseGPUDeviceID)); m_testVariant.initialize(&m_oLog,&m_oData); m_oLog << " Kernel Performance Test ======================================="<<std::endl; m_oData<< "# Kernel Performance Test: Data Dump for file:" <<m_filename<<".xml" << std::endl; // Init XML m_dataXML.reset(); std::stringstream xml("<PerformanceTest type=\"KernelTest\">" "<Description></Description>" "<DataTable>" "<Header></Header>" "<Data></Data>" "</DataTable>" "</PerformanceTest>"); bool r = m_dataXML.load(xml); ASSERTMSG(r,"Could not load initial xml data file"); // DESCRIPTION ========================== // Add GPU INFO XMLNodeType descNode = m_dataXML.child("PerformanceTest").child("Description"); // Write header to file for this TestMethod! cudaDeviceProp props; CHECK_CUDA(cudaGetDeviceProperties(&props,UseGPUDeviceID)); std::stringstream s; utilCuda::writeCudaDeviceProbs(s,props,UseGPUDeviceID); auto gpuInfo = descNode.append_child("GPUInfos").append_child(XMLStringNode); gpuInfo.set_value(s.str().c_str()); // Write variant specific descriptions! 
auto desc = m_testVariant.getDescriptions(); for(auto & d : desc){ auto gpuInfo = descNode.append_child(d.first.c_str()).append_child(XMLStringNode); gpuInfo.set_value(d.second.c_str()); } // DATA ========================================= XMLNodeType dataTable = m_dataXML.child("PerformanceTest").child("DataTable"); auto headerNode = dataTable.child("Header"); // Write variant specific header { std::vector<std::string> head = m_testVariant.getDataColumHeader(); for(auto & h : head){ auto col = headerNode.append_child("Column").append_child(XMLStringNode); col.set_value(h.c_str()); } } { // Write TestMethod Specific Column Header! std::vector<std::string> head; head.push_back("nFlop"); head.push_back("GFlops"); head.push_back("Memory Bandwith [Bytes/sec]"); head.push_back("elapsedTimeCopyToGPU_Avg [s]"); head.push_back("gpuIterationTime_Avg [s]"); head.push_back("elapsedTimeCopyFromGPU_Avg [s]"); head.push_back("cpuIterationTime_Avg [s]"); head.push_back("nIterationsForTradeoff"); head.push_back("speedUpFactor"); head.push_back("maxRelTol_Avg (over all TestProblems)"); head.push_back("avgRelTol_Avg (over all TestProblems)"); head.push_back("maxUlp_Avg (over all TestProblems)"); head.push_back("avgUlp_Avg (over all TestProblems)"); for(auto & h : head){ auto col = headerNode.append_child("Column").append_child(XMLStringNode); col.set_value(h.c_str()); } } // Save XML already for savety! m_dataXML.save_file((m_filename+".xml").c_str()," "); }
int main(int argc, char **argv) { pArgc = &argc; pArgv = argv; printf("%s Starting...\n\n", argv[0]); if (checkCmdLineFlag(argc, (const char **)argv, "help")) { printf("\nUsage: FunctionPointers (SobelFilter) <options>\n"); printf("\t\t-mode=n (0=original, 1=texture, 2=smem + texture)\n"); printf("\t\t-file=ref_orig.pgm (ref_tex.pgm, ref_shared.pgm)\n\n"); exit(EXIT_WAIVED); } if (checkCmdLineFlag(argc, (const char **)argv, "file")) { g_bQAReadback = true; runAutoTest(argc, argv); } // use command-line specified CUDA device, otherwise use device with highest Gflops/s if (checkCmdLineFlag(argc, (const char **)argv, "device")) { printf(" This SDK does not explicitly support -device=n when running with OpenGL.\n"); printf(" When specifying -device=n (n=0,1,2,....) the sample must not use OpenGL.\n"); printf(" See details below to run without OpenGL:\n\n"); printf(" > %s -device=n\n\n", argv[0]); printf("exiting...\n"); exit(EXIT_WAIVED); } if (!g_bQAReadback) { // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL(&argc, argv); int dev = findCapableDevice(argc, argv); checkDeviceMeetComputeSpec(argc, argv); if (dev != -1) { cudaGLSetGLDevice(dev); } else { exit(EXIT_WAIVED); } sdkCreateTimer(&timer); sdkResetTimer(&timer); glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutReshapeFunc(reshape); loadDefaultImage(argv[0]); // If code is not printing the USage, then we execute this path. 
printf("I: display Image (no filtering)\n"); printf("T: display Sobel Edge Detection (Using Texture)\n"); printf("S: display Sobel Edge Detection (Using SMEM+Texture)\n"); printf("Use the '-' and '=' keys to change the brightness.\n"); printf("b: switch block filter operation (Mean/Sobel)\n"); printf("p: switch point filter operation (Threshold ON/OFF)\n"); fflush(stdout); atexit(cleanup); glutTimerFunc(REFRESH_DELAY, timerEvent,0); glutMainLoop(); } cudaDeviceReset(); exit(EXIT_SUCCESS); }
void initializeData(char *file) { GLint bsize; unsigned int w, h; size_t file_length= strlen(file); if (!strcmp(&file[file_length-3], "pgm")) { if (sdkLoadPGM<unsigned char>(file, &pixels, &w, &h) != true) { printf("Failed to load PGM image file: %s\n", file); exit(EXIT_FAILURE); } g_Bpp = 1; } else if (!strcmp(&file[file_length-3], "ppm")) { if (sdkLoadPPM4(file, &pixels, &w, &h) != true) { printf("Failed to load PPM image file: %s\n", file); exit(EXIT_FAILURE); } g_Bpp = 4; } else { cudaDeviceReset(); exit(EXIT_FAILURE); } imWidth = (int)w; imHeight = (int)h; setupTexture(imWidth, imHeight, pixels, g_Bpp); // copy function pointer tables to host side for later use setupFunctionTables(); memset(pixels, 0x0, g_Bpp * sizeof(Pixel) * imWidth * imHeight); if (!g_bQAReadback) { // use OpenGL Path glGenBuffers(1, &pbo_buffer); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer); glBufferData(GL_PIXEL_UNPACK_BUFFER, g_Bpp * sizeof(Pixel) * imWidth * imHeight, pixels, GL_STREAM_DRAW); glGetBufferParameteriv(GL_PIXEL_UNPACK_BUFFER, GL_BUFFER_SIZE, &bsize); if ((GLuint)bsize != (g_Bpp * sizeof(Pixel) * imWidth * imHeight)) { printf("Buffer object (%d) has incorrect size (%d).\n", (unsigned)pbo_buffer, (unsigned)bsize); cudaDeviceReset(); exit(EXIT_FAILURE); } glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); // register this buffer object with CUDA checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo_buffer, cudaGraphicsMapFlagsWriteDiscard)); glGenTextures(1, &texid); glBindTexture(GL_TEXTURE_2D, texid); glTexImage2D(GL_TEXTURE_2D, 0, ((g_Bpp==1) ? GL_LUMINANCE : GL_BGRA), imWidth, imHeight, 0, GL_LUMINANCE, GL_UNSIGNED_BYTE, NULL); glBindTexture(GL_TEXTURE_2D, 0); glPixelStorei(GL_UNPACK_ALIGNMENT, 1); glPixelStorei(GL_PACK_ALIGNMENT, 1); } }
int main(int argc, char **argv) { int N = 0, nz = 0, *I = NULL, *J = NULL; float *val = NULL; const float tol = 1e-5f; const int max_iter = 10000; float *x; float *rhs; float a, b, na, r0, r1; float dot; float *r, *p, *Ax; int k; float alpha, beta, alpham1; printf("Starting [%s]...\n", sSDKname); // This will pick the best possible CUDA capable device cudaDeviceProp deviceProp; int devID = findCudaDevice(argc, (const char **)argv); checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); if (!deviceProp.managedMemory) { // This samples requires being run on a device that supports Unified Memory fprintf(stderr, "Unified Memory not supported on this device\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(EXIT_WAIVED); } // Statistics about the GPU device printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n", deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor); /* Generate a random tridiagonal symmetric matrix in CSR format */ N = 1048576; nz = (N-2)*3 + 4; cudaMallocManaged((void **)&I, sizeof(int)*(N+1)); cudaMallocManaged((void **)&J, sizeof(int)*nz); cudaMallocManaged((void **)&val, sizeof(float)*nz); genTridiag(I, J, val, N, nz); cudaMallocManaged((void **)&x, sizeof(float)*N); cudaMallocManaged((void **)&rhs, sizeof(float)*N); for (int i = 0; i < N; i++) { rhs[i] = 1.0; x[i] = 0.0; } /* Get handle to the CUBLAS context */ cublasHandle_t cublasHandle = 0; cublasStatus_t cublasStatus; cublasStatus = cublasCreate(&cublasHandle); checkCudaErrors(cublasStatus); /* Get handle to the CUSPARSE context */ cusparseHandle_t cusparseHandle = 0; cusparseStatus_t cusparseStatus; cusparseStatus = cusparseCreate(&cusparseHandle); 
checkCudaErrors(cusparseStatus); cusparseMatDescr_t descr = 0; cusparseStatus = cusparseCreateMatDescr(&descr); checkCudaErrors(cusparseStatus); cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO); // temp memory for CG checkCudaErrors(cudaMallocManaged((void **)&r, N*sizeof(float))); checkCudaErrors(cudaMallocManaged((void **)&p, N*sizeof(float))); checkCudaErrors(cudaMallocManaged((void **)&Ax, N*sizeof(float))); cudaDeviceSynchronize(); for (int i=0; i < N; i++) { r[i] = rhs[i]; } alpha = 1.0; alpham1 = -1.0; beta = 0.0; r0 = 0.; cusparseScsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, &alpha, descr, val, I, J, x, &beta, Ax); cublasSaxpy(cublasHandle, N, &alpham1, Ax, 1, r, 1); cublasStatus = cublasSdot(cublasHandle, N, r, 1, r, 1, &r1); k = 1; while (r1 > tol*tol && k <= max_iter) { if (k > 1) { b = r1 / r0; cublasStatus = cublasSscal(cublasHandle, N, &b, p, 1); cublasStatus = cublasSaxpy(cublasHandle, N, &alpha, r, 1, p, 1); } else { cublasStatus = cublasScopy(cublasHandle, N, r, 1, p, 1); } cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, &alpha, descr, val, I, J, p, &beta, Ax); cublasStatus = cublasSdot(cublasHandle, N, p, 1, Ax, 1, &dot); a = r1 / dot; cublasStatus = cublasSaxpy(cublasHandle, N, &a, p, 1, x, 1); na = -a; cublasStatus = cublasSaxpy(cublasHandle, N, &na, Ax, 1, r, 1); r0 = r1; cublasStatus = cublasSdot(cublasHandle, N, r, 1, r, 1, &r1); cudaThreadSynchronize(); printf("iteration = %3d, residual = %e\n", k, sqrt(r1)); k++; } printf("Final residual: %e\n",sqrt(r1)); fprintf(stdout,"&&&& uvm_cg test %s\n", (sqrt(r1) < tol) ? 
"PASSED" : "FAILED"); float rsum, diff, err = 0.0; for (int i = 0; i < N; i++) { rsum = 0.0; for (int j = I[i]; j < I[i+1]; j++) { rsum += val[j]*x[J[j]]; } diff = fabs(rsum - rhs[i]); if (diff > err) { err = diff; } } cusparseDestroy(cusparseHandle); cublasDestroy(cublasHandle); cudaFree(I); cudaFree(J); cudaFree(val); cudaFree(x); cudaFree(rhs); cudaFree(r); cudaFree(p); cudaFree(Ax); cudaDeviceReset(); printf("Test Summary: Error amount = %f, result = %s\n", err, (k <= max_iter) ? "SUCCESS" : "FAILURE"); exit((k <= max_iter) ? EXIT_SUCCESS : EXIT_FAILURE); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { pArgc = &argc; pArgv = argv; char *ref_file = NULL; #if defined(__linux__) setenv ("DISPLAY", ":0", 0); #endif printf("%s Starting...\n\n", sSDKsample); printf("NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n\n"); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if (argc > 1) { if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); fpsLimit = frameCheckNumber; } } // Get the path of the filename char *filename; if (getCmdLineArgumentString(argc, (const char **) argv, "image", &filename)) { image_filename = filename; } // load image char *image_path = sdkFindFilePath(image_filename, argv[0]); if (image_path == NULL) { fprintf(stderr, "Error unable to find and load image file: '%s'\n", image_filename); exit(EXIT_FAILURE); } sdkLoadPPM4ub(image_path, (unsigned char **)&h_img, &width, &height); if (!h_img) { printf("Error unable to load PPM file: '%s'\n", image_path); exit(EXIT_FAILURE); } printf("Loaded '%s', %d x %d pixels\n", image_path, width, height); if (checkCmdLineFlag(argc, (const char **)argv, "threads")) { nthreads = getCmdLineArgumentInt(argc, (const char **) argv, "threads"); } if (checkCmdLineFlag(argc, (const char **)argv, "sigma")) { sigma = getCmdLineArgumentFloat(argc, (const char **) argv, "sigma"); } runBenchmark = checkCmdLineFlag(argc, (const char **) argv, "benchmark"); int device; struct cudaDeviceProp prop; cudaGetDevice(&device); cudaGetDeviceProperties(&prop, device); if (!strncmp("Tesla", prop.name, 5)) { printf("Tesla card detected, running the test in benchmark mode (no OpenGL display)\n"); // runBenchmark = true; runBenchmark = true; } // Benchmark or 
AutoTest mode detected, no OpenGL if (runBenchmark == true || ref_file != NULL) { findCudaDevice(argc, (const char **)argv); } else { // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL(&argc, argv); findCudaGLDevice(argc, (const char **)argv); } initCudaBuffers(); if (ref_file) { printf("(Automated Testing)\n"); bool testPassed = runSingleTest(ref_file, argv[0]); cleanup(); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(testPassed ? EXIT_SUCCESS : EXIT_FAILURE); } if (runBenchmark) { printf("(Run Benchmark)\n"); benchmark(100); cleanup(); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(EXIT_SUCCESS); } initGLBuffers(); glutMainLoop(); exit(EXIT_SUCCESS); }
int main(int argc, char** argv){ bool loadedScene = false; //for(int i=1; i<argc; i++){ string header; string data; //istringstream liness(argv[i]); //getline(liness, header, '='); getline(liness, data, '='); //if(strcmp(header.c_str(), "mesh")==0){ //renderScene = new scene(data); //laoding file for obj data = "../../objs/plane.obj"; mesh = new obj(); objLoader* loader = new objLoader(data, mesh); mesh->buildVBOs(); meshVector.push_back(mesh); data = "../../objs/bunny.obj"; mesh = new obj(); loader = new objLoader(data, mesh); mesh->buildVBOs(); meshVector.push_back(mesh); delete loader; loadedScene = true; //} //} stencilBuffer = new int[width*height]; #if STENCIL == 1 isStencil = true; #else isStencil = false; #endif firstObj = 0; secondObj = FIRST; if(!loadedScene){ cout << "Usage: mesh=[obj file]" << endl; return 0; } frame = 0; seconds = time (NULL); fpstracker = 0; // Launch CUDA/GL #ifdef __APPLE__ // Needed in OSX to force use of OpenGL3.2 glfwOpenWindowHint(GLFW_OPENGL_VERSION_MAJOR, 3); glfwOpenWindowHint(GLFW_OPENGL_VERSION_MINOR, 2); glfwOpenWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE); glfwOpenWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); init(); #else init(argc, argv); #endif initCuda(); initVAO(); initTextures(); GLuint passthroughProgram; passthroughProgram = initShader("shaders/passthroughVS.glsl", "shaders/passthroughFS.glsl"); glUseProgram(passthroughProgram); glActiveTexture(GL_TEXTURE0); #ifdef __APPLE__ // send into GLFW main loop while(1){ display(); if (glfwGetKey(GLFW_KEY_ESC) == GLFW_PRESS || !glfwGetWindowParam( GLFW_OPENED )){ kernelCleanup(); cudaDeviceReset(); exit(0); } } glfwTerminate(); #else glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutMainLoop(); #endif kernelCleanup(); return 0; }
/* Main */ int main(int argc, char **argv) { cublasStatus_t status; float *h_A; float *h_B; float *h_C; float *h_C_rnd; float *h_C_ref; float *d_A = 0; float *d_B = 0; float *d_C = 0; float alpha = 1.0f; float beta = 0.0f; int n2 = N * N; int i; cublasHandle_t handle; int dev_id; cudaDeviceProp device_prop; bool do_device_api_test = false; float host_api_test_ratio, device_api_test_ratio; /* Initialize CUBLAS */ printf("simpleCUBLAS test running...\n"); dev_id = findCudaDevice(argc, (const char **) argv); checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id)); if ((device_prop.major << 4) + device_prop.minor >= 0x35) { printf("Host and device APIs will be tested.\n"); do_device_api_test = true; } /* else if ((device_prop.major << 4) + device_prop.minor >= 0x20) { printf("Host API will be tested.\n"); do_device_api_test = false; } */ else { fprintf(stderr, "simpleDevLibCUBLAS examples requires Compute Capability of SM 3.5 or higher\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_SUCCESS; } status = cublasCreate(&handle); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! CUBLAS initialization error\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* Allocate host memory for the matrices */ h_A = (float *)malloc(n2 * sizeof(h_A[0])); if (h_A == 0) { fprintf(stderr, "!!!! 
host memory allocation error (A)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } h_B = (float *)malloc(n2 * sizeof(h_B[0])); if (h_B == 0) { fprintf(stderr, "!!!! host memory allocation error (B)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } h_C_rnd = (float *)malloc(n2 * sizeof(h_C_rnd[0])); if (h_C_rnd == 0) { fprintf(stderr, "!!!! host memory allocation error (C_rnd)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } h_C = (float *)malloc(n2 * sizeof(h_C_ref[0])); if (h_C == 0) { fprintf(stderr, "!!!! host memory allocation error (C)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. 
Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* Fill the matrices with test data */ for (i = 0; i < n2; i++) { h_A[i] = rand() / (float)RAND_MAX; h_B[i] = rand() / (float)RAND_MAX; h_C_rnd[i] = rand() / (float)RAND_MAX; h_C[i] = h_C_rnd[i]; } /* Allocate device memory for the matrices */ if (cudaMalloc((void **)&d_A, n2 * sizeof(d_A[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate A)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } if (cudaMalloc((void **)&d_B, n2 * sizeof(d_B[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate B)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } if (cudaMalloc((void **)&d_C, n2 * sizeof(d_C[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate C)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. 
Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* Initialize the device matrices with the host matrices */ status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (write A)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (write B)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } status = cublasSetVector(n2, sizeof(h_C_rnd[0]), h_C_rnd, 1, d_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (write C)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. 
Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* * Performs operation using plain C code */ simple_sgemm(N, alpha, h_A, h_B, beta, h_C); h_C_ref = h_C; /* * Performs operation using cublas */ status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! kernel execution error\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* Allocate host memory for reading back the result from device memory */ h_C = (float *)malloc(n2 * sizeof(h_C[0])); if (h_C == 0) { fprintf(stderr, "!!!! host memory allocation error (C)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* Read the result back */ status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (read C)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. 
Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* Check result against reference */ host_api_test_ratio = check_result(h_C, h_C_ref, n2); if (do_device_api_test) { /* Reset device resident C matrix */ status = cublasSetVector(n2, sizeof(h_C_rnd[0]), h_C_rnd, 1, d_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (write C)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* * Performs operation using the device API of CUBLAS library */ device_cublas_sgemm(N, alpha, d_A, d_B, beta, d_C); /* Read the result back */ status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (read C)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* Check result against reference */ device_api_test_ratio = check_result(h_C, h_C_ref, n2); } /* Memory clean up */ free(h_A); free(h_B); free(h_C); free(h_C_rnd); free(h_C_ref); if (cudaFree(d_A) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (A)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. 
Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } if (cudaFree(d_B) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (B)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } if (cudaFree(d_C) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (C)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } /* Shutdown */ status = cublasDestroy(handle); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! shutdown error (A)\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return EXIT_FAILURE; } // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); bool test_result = do_device_api_test ? 
host_api_test_ratio < 1e-6 && device_api_test_ratio < 1e-6 : host_api_test_ratio < 1e-6; printf("simpleCUBLAS completed, returned %s\n", test_result ? "OK" : "ERROR!"); exit(test_result ? EXIT_SUCCESS : EXIT_FAILURE); }
int main(int argc, char **argv) { char *multiMethodChoice = NULL; char *scalingChoice = NULL; bool use_threads = true; bool bqatest = false; bool strongScaling = false; pArgc = &argc; pArgv = argv; printf("%s Starting...\n\n", argv[0]); if (checkCmdLineFlag(argc, (const char **)argv, "qatest")) { bqatest = true; } getCmdLineArgumentString(argc, (const char **)argv, "method", &multiMethodChoice); getCmdLineArgumentString(argc, (const char **)argv, "scaling", &scalingChoice); if (checkCmdLineFlag(argc, (const char **)argv, "h") || checkCmdLineFlag(argc, (const char **)argv, "help")) { usage(); exit(EXIT_SUCCESS); } if (multiMethodChoice == NULL) { use_threads = true; } else { if (!strcasecmp(multiMethodChoice, "threaded")) { use_threads = true; } else { use_threads = false; } } if (use_threads == false) { printf("Using single CPU thread for multiple GPUs\n"); } if (scalingChoice == NULL) { strongScaling = false; } else { if (!strcasecmp(scalingChoice, "strong")) { strongScaling = true; } else { strongScaling = false; } } //GPU number present in the system int GPU_N; checkCudaErrors(cudaGetDeviceCount(&GPU_N)); int nOptions = 256; nOptions = adjustProblemSize(GPU_N, nOptions); // select problem size int scale = (strongScaling) ? 
1 : GPU_N; int OPT_N = nOptions * scale; int PATH_N = 262144; const unsigned long long SEED = 777; // initialize the timers hTimer = new StopWatchInterface*[GPU_N]; for (int i=0; i<GPU_N; i++) { sdkCreateTimer(&hTimer[i]); sdkResetTimer(&hTimer[i]); } //Input data array TOptionData *optionData = new TOptionData[OPT_N]; //Final GPU MC results TOptionValue *callValueGPU = new TOptionValue[OPT_N]; //"Theoretical" call values by Black-Scholes formula float *callValueBS = new float[OPT_N]; //Solver config TOptionPlan *optionSolver = new TOptionPlan[GPU_N]; //OS thread ID CUTThread *threadID = new CUTThread[GPU_N]; int gpuBase, gpuIndex; int i; float time; double delta, ref, sumDelta, sumRef, sumReserve; printf("MonteCarloMultiGPU\n"); printf("==================\n"); printf("Parallelization method = %s\n", use_threads ? "threaded" : "streamed"); printf("Problem scaling = %s\n", strongScaling? "strong" : "weak"); printf("Number of GPUs = %d\n", GPU_N); printf("Total number of options = %d\n", OPT_N); printf("Number of paths = %d\n", PATH_N); printf("main(): generating input data...\n"); srand(123); for (i=0; i < OPT_N; i++) { optionData[i].S = randFloat(5.0f, 50.0f); optionData[i].X = randFloat(10.0f, 25.0f); optionData[i].T = randFloat(1.0f, 5.0f); optionData[i].R = 0.06f; optionData[i].V = 0.10f; callValueGPU[i].Expected = -1.0f; callValueGPU[i].Confidence = -1.0f; } printf("main(): starting %i host threads...\n", GPU_N); //Get option count for each GPU for (i = 0; i < GPU_N; i++) { optionSolver[i].optionCount = OPT_N / GPU_N; } //Take into account cases with "odd" option counts for (i = 0; i < (OPT_N % GPU_N); i++) { optionSolver[i].optionCount++; } //Assign GPU option ranges gpuBase = 0; for (i = 0; i < GPU_N; i++) { optionSolver[i].device = i; optionSolver[i].optionData = optionData + gpuBase; optionSolver[i].callValue = callValueGPU + gpuBase; // all devices use the same global seed, but start // the sequence at a different offset optionSolver[i].seed = SEED; 
optionSolver[i].pathN = PATH_N; gpuBase += optionSolver[i].optionCount; } if (use_threads || bqatest) { //Start CPU thread for each GPU for (gpuIndex = 0; gpuIndex < GPU_N; gpuIndex++) { threadID[gpuIndex] = cutStartThread((CUT_THREADROUTINE)solverThread, &optionSolver[gpuIndex]); } printf("main(): waiting for GPU results...\n"); cutWaitForThreads(threadID, GPU_N); printf("main(): GPU statistics, threaded\n"); for (i = 0; i < GPU_N; i++) { cudaDeviceProp deviceProp; checkCudaErrors(cudaGetDeviceProperties(&deviceProp, optionSolver[i].device)); printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name); printf("Options : %i\n", optionSolver[i].optionCount); printf("Simulation paths: %i\n", optionSolver[i].pathN); time = sdkGetTimerValue(&hTimer[i]); printf("Total time (ms.): %f\n", time); printf("Options per sec.: %f\n", OPT_N / (time * 0.001)); } printf("main(): comparing Monte Carlo and Black-Scholes results...\n"); sumDelta = 0; sumRef = 0; sumReserve = 0; for (i = 0; i < OPT_N; i++) { BlackScholesCall(callValueBS[i], optionData[i]); delta = fabs(callValueBS[i] - callValueGPU[i].Expected); ref = callValueBS[i]; sumDelta += delta; sumRef += fabs(ref); if (delta > 1e-6) { sumReserve += callValueGPU[i].Confidence / delta; } #ifdef PRINT_RESULTS printf("BS: %f; delta: %E\n", callValueBS[i], delta); #endif } sumReserve /= OPT_N; } if (!use_threads || bqatest) { multiSolver(optionSolver, GPU_N); printf("main(): GPU statistics, streamed\n"); for (i = 0; i < GPU_N; i++) { cudaDeviceProp deviceProp; checkCudaErrors(cudaGetDeviceProperties(&deviceProp, optionSolver[i].device)); printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name); printf("Options : %i\n", optionSolver[i].optionCount); printf("Simulation paths: %i\n", optionSolver[i].pathN); } time = sdkGetTimerValue(&hTimer[0]); printf("\nTotal time (ms.): %f\n", time); printf("\tNote: This is elapsed time for all to compute.\n"); printf("Options per sec.: %f\n", OPT_N / (time * 0.001)); 
printf("main(): comparing Monte Carlo and Black-Scholes results...\n"); sumDelta = 0; sumRef = 0; sumReserve = 0; for (i = 0; i < OPT_N; i++) { BlackScholesCall(callValueBS[i], optionData[i]); delta = fabs(callValueBS[i] - callValueGPU[i].Expected); ref = callValueBS[i]; sumDelta += delta; sumRef += fabs(ref); if (delta > 1e-6) { sumReserve += callValueGPU[i].Confidence / delta; } #ifdef PRINT_RESULTS printf("BS: %f; delta: %E\n", callValueBS[i], delta); #endif } sumReserve /= OPT_N; } #ifdef DO_CPU printf("main(): running CPU MonteCarlo...\n"); TOptionValue callValueCPU; sumDelta = 0; sumRef = 0; for (i = 0; i < OPT_N; i++) { MonteCarloCPU( callValueCPU, optionData[i], NULL, PATH_N ); delta = fabs(callValueCPU.Expected - callValueGPU[i].Expected); ref = callValueCPU.Expected; sumDelta += delta; sumRef += fabs(ref); printf("Exp : %f | %f\t", callValueCPU.Expected, callValueGPU[i].Expected); printf("Conf: %f | %f\n", callValueCPU.Confidence, callValueGPU[i].Confidence); } printf("L1 norm: %E\n", sumDelta / sumRef); #endif printf("Shutting down...\n"); for (int i=0; i<GPU_N; i++) { sdkStartTimer(&hTimer[i]); checkCudaErrors(cudaSetDevice(i)); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); } delete[] optionSolver; delete[] callValueBS; delete[] callValueGPU; delete[] optionData; delete[] threadID; delete[] hTimer; printf("Test Summary...\n"); printf("L1 norm : %E\n", sumDelta / sumRef); printf("Average reserve: %f\n", sumReserve); printf(sumReserve > 1.0f ? "Test passed\n" : "Test failed!\n"); exit(sumReserve > 1.0f ? EXIT_SUCCESS : EXIT_FAILURE); }
/*
 * Lua binding that resets the current CUDA device.
 *
 * The status returned by cudaDeviceReset() is handed to THCudaCheck.
 * No values are returned to Lua (the function reports 0 results).
 */
static int cutorch_deviceReset(lua_State *L)
{
  const cudaError_t resetStatus = cudaDeviceReset();

  THCudaCheck(resetStatus);

  return 0;
}
//----------------------------------------------------------------------------- // Name: Cleanup() // Desc: Releases all previously initialized objects //----------------------------------------------------------------------------- VOID Cleanup() { if (g_histogram.pBuffer != NULL) { // Unregister vertex buffer cudaGraphicsUnregisterResource(g_histogram.cudaResource); getLastCudaError("cudaGraphicsUnregisterResource failed"); g_histogram.pBuffer->Release(); } if (g_histogram.pBufferSRV != NULL) { g_histogram.pBufferSRV->Release(); } if (g_pDisplayEffect != NULL) { g_pDisplayEffect->Release(); } if (g_pCompositeEffect != NULL) { g_pCompositeEffect->Release(); } if (g_color.pBufferSRV != NULL) { g_color.pBufferSRV->Release(); } if (g_color.pBufferRTV != NULL) { g_color.pBufferRTV->Release(); } if (g_color.pBuffer != NULL) { // Unregister vertex buffer cudaGraphicsUnregisterResource(g_color.cudaResource); getLastCudaError("cudaD3D10UnregisterResource failed"); g_color.pBuffer->Release(); } if (g_pRasterState != NULL) { g_pRasterState->Release(); } if (g_pSwapChainRTV != NULL) { g_pSwapChainRTV->Release(); } if (g_pSwapChain != NULL) { g_pSwapChain->Release(); } // Uninitialize CUDA cudaDeviceReset(); getLastCudaError("cudaDeviceReset failed"); if (g_pd3dDevice != NULL) { g_pd3dDevice->Release(); } }
void initializeData(char *file) { GLint bsize; unsigned int w, h; size_t file_length= strlen(file); if (!strcmp(&file[file_length-3], "pgm")) { if (sdkLoadPGM<unsigned char>(file, &pixels, &w, &h) != true) { printf("Failed to load PGM image file: %s\n", file); exit(EXIT_FAILURE); } g_Bpp = 1; } else if (!strcmp(&file[file_length-3], "ppm")) { if (sdkLoadPPM4(file, &pixels, &w, &h) != true) { printf("Failed to load PPM image file: %s\n", file); exit(EXIT_FAILURE); } g_Bpp = 4; } else { // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(EXIT_FAILURE); } imWidth = (int)w; imHeight = (int)h; setupTexture(imWidth, imHeight, pixels, g_Bpp); memset(pixels, 0x0, g_Bpp * sizeof(Pixel) * imWidth * imHeight); if (!g_bQAReadback) { // use OpenGL Path glGenBuffers(1, &pbo_buffer); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer); glBufferData(GL_PIXEL_UNPACK_BUFFER, g_Bpp * sizeof(Pixel) * imWidth * imHeight, pixels, GL_STREAM_DRAW); glGetBufferParameteriv(GL_PIXEL_UNPACK_BUFFER, GL_BUFFER_SIZE, &bsize); if ((GLuint)bsize != (g_Bpp * sizeof(Pixel) * imWidth * imHeight)) { printf("Buffer object (%d) has incorrect size (%d).\n", (unsigned)pbo_buffer, (unsigned)bsize); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. 
Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(EXIT_FAILURE); } glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); // register this buffer object with CUDA checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo_buffer, cudaGraphicsMapFlagsWriteDiscard)); glGenTextures(1, &texid); glBindTexture(GL_TEXTURE_2D, texid); glTexImage2D(GL_TEXTURE_2D, 0, ((g_Bpp==1) ? GL_LUMINANCE : GL_BGRA), imWidth, imHeight, 0, GL_LUMINANCE, GL_UNSIGNED_BYTE, NULL); glBindTexture(GL_TEXTURE_2D, 0); glPixelStorei(GL_UNPACK_ALIGNMENT, 1); glPixelStorei(GL_PACK_ALIGNMENT, 1); } }
//----------------------------------------------------------------------------- // Name: Cleanup() // Desc: Releases all previously initialized objects //----------------------------------------------------------------------------- VOID Cleanup() { if (g_histogram.pBuffer != NULL) { // Unregister vertex buffer cudaGraphicsUnregisterResource(g_histogram.cudaResource); getLastCudaError("cudaGraphicsUnregisterResource failed"); g_histogram.pBuffer->Release(); } if (g_histogram.pBufferSRV != NULL) { g_histogram.pBufferSRV->Release(); } if (g_pDisplayEffect != NULL) { g_pDisplayEffect->Release(); } if (g_pCompositeEffect != NULL) { g_pCompositeEffect->Release(); } if (g_color.pBufferSRV != NULL) { g_color.pBufferSRV->Release(); } if (g_color.pBufferRTV != NULL) { g_color.pBufferRTV->Release(); } if (g_color.pBuffer != NULL) { // Unregister vertex buffer cudaGraphicsUnregisterResource(g_color.cudaResource); getLastCudaError("cudaD3D10UnregisterResource failed"); g_color.pBuffer->Release(); } if (g_pRasterState != NULL) { g_pRasterState->Release(); } if (g_pSwapChainRTV != NULL) { g_pSwapChainRTV->Release(); } if (g_pSwapChain != NULL) { g_pSwapChain->Release(); } // Uninitialize CUDA // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); getLastCudaError("cudaDeviceReset failed"); if (g_pd3dDevice != NULL) { g_pd3dDevice->Release(); } }
void CUDAManager::init() { //cudaDeviceReset(); // This will pick the best possible CUDA capable device cudaDeviceProp deviceProp; int devID = get_max_multiprocessor_cuda_device(); if (devID < 0) { UG_THROW("no CUDA device found.\n"); } CUDA_CHECK_SUCCESS(cudaSetDevice(devID), "setting up device " << devID); CUDA_CHECK_STATUS(cudaGetDeviceProperties(&deviceProp, devID)); // Statistics about the GPU device printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n", deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor); int version = (deviceProp.major * 0x10 + deviceProp.minor); if (version < 0x11) { cudaDeviceReset(); UG_THROW("Requires a minimum CUDA compute 1.1 capability\n"); } m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock; UG_DLOG(DID_CUDA, 0, "CUDA Initialized:" "\n - CUDA Device '" << deviceProp.name << "': " << "\n - Total global Memory: " << deviceProp.totalGlobalMem/(1024*1024*1024.0) << " GB" "\n - Shared Mem per Block: " << deviceProp.sharedMemPerBlock/(1024.0) << " KB" "\n - Regs per Block: " << deviceProp.regsPerBlock << "\n - Warp Size: " << deviceProp.warpSize << "\n - Maximum Number of Threads per Block: " << deviceProp.maxThreadsPerBlock << "\n - Max Thread Dim: (" << deviceProp.maxThreadsDim[0] << ", " << deviceProp.maxThreadsDim[1] << ", " << deviceProp.maxThreadsDim[2] << ")" << "\n - Max Grid Size: (" << deviceProp.maxGridSize[0] << ", " << deviceProp.maxGridSize[1] << ", " << deviceProp.maxGridSize[2] << ")" << "\n - Clock Rate: " << deviceProp.clockRate/1000.0 << " Mhz" "\n - Total Const Mem: " << deviceProp.totalConstMem/(1024.0) << " KB" "\n - Compute Capability: " << deviceProp.major << "." 
<< deviceProp.minor << "\n - Number of multiprocessors: " << deviceProp.multiProcessorCount << "\n - Maximum Texture Size 1D: " << deviceProp.maxTexture1D << "\n - Maximum Texture Size 2D: " << deviceProp.maxTexture2D[0] << " x " << deviceProp.maxTexture2D[1] << "\n - Maximum Texture Size 3D: " << deviceProp.maxTexture3D[0] << " x " << deviceProp.maxTexture3D[1] << " x " << deviceProp.maxTexture3D[2] << "\n - Memory Clock Rate: " << deviceProp.memoryClockRate/(1000000.0)<< " Mhz" "\n - Memory Bus Width: " << deviceProp.memoryBusWidth << "\n - L2 Cache Size: " << deviceProp.l2CacheSize << "\n - Max Threads per multiprocessor: " << deviceProp.maxThreadsPerMultiProcessor << "\n"); /* Get handle to the CUBLAS context */ cublasHandle = 0; cublasStatus_t cublasStatus = cublasCreate(&cublasHandle); CUDA_CHECK_STATUS(cublasStatus); #ifdef USE_CUSPARSE /* Get handle to the CUSPARSE context */ cusparseHandle = 0; cusparseStatus_t cusparseStatus = cusparseCreate(&cusparseHandle); CUDA_CHECK_STATUS(cusparseStatus); cusparseMatDescr_t descr = 0; cusparseStatus = cusparseCreateMatDescr(&descr); CUDA_CHECK_STATUS(cusparseStatus); #endif get_temp_buffer<double>(1024); m_tempRetBuffer = MyCudaAlloc<double>(4); }
int main(int argc, char **argv) { // Start logs printf("%s Starting...\n\n", argv[0]); unsigned int useDoublePrecision; char *precisionChoice; getCmdLineArgumentString(argc, (const char **)argv, "type", &precisionChoice); if (precisionChoice == NULL) { useDoublePrecision = 0; } else { if (!STRCASECMP(precisionChoice, "double")) { useDoublePrecision = 1; } else { useDoublePrecision = 0; } } unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION]; float *h_OutputGPU, *d_Output; int dim, pos; double delta, ref, sumDelta, sumRef, L1norm, gpuTime; StopWatchInterface *hTimer = NULL; if (sizeof(INT64) != 8) { printf("sizeof(INT64) != 8\n"); return 0; } // use command-line specified CUDA device, otherwise use device with highest Gflops/s int dev = findCudaDevice(argc, (const char **)argv); sdkCreateTimer(&hTimer); int deviceIndex; checkCudaErrors(cudaGetDevice(&deviceIndex)); cudaDeviceProp deviceProp; checkCudaErrors(cudaGetDeviceProperties(&deviceProp, deviceIndex)); int version = deviceProp.major * 10 + deviceProp.minor; if (useDoublePrecision && version < 13) { printf("Double precision not supported.\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. 
Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return 0; } printf("Allocating GPU memory...\n"); checkCudaErrors(cudaMalloc((void **)&d_Output, QRNG_DIMENSIONS * N * sizeof(float))); printf("Allocating CPU memory...\n"); h_OutputGPU = (float *)malloc(QRNG_DIMENSIONS * N * sizeof(float)); printf("Initializing QRNG tables...\n\n"); initQuasirandomGenerator(tableCPU); if (useDoublePrecision) { initTable_SM13(tableCPU); } else { initTable_SM10(tableCPU); } printf("Testing QRNG...\n\n"); checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float))); int numIterations = 20; for (int i = -1; i < numIterations; i++) { if (i == 0) { checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); } if (useDoublePrecision) { quasirandomGenerator_SM13(d_Output, 0, N); } else { quasirandomGenerator_SM10(d_Output, 0, N); } } checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); gpuTime = sdkGetTimerValue(&hTimer)/(double)numIterations*1e-3; printf("quasirandomGenerator, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", (double)QRNG_DIMENSIONS * (double)N * 1.0E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128*QRNG_DIMENSIONS); printf("\nReading GPU results...\n"); checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost)); printf("Comparing to the CPU results...\n\n"); sumDelta = 0; sumRef = 0; for (dim = 0; dim < QRNG_DIMENSIONS; dim++) for (pos = 0; pos < N; pos++) { ref = getQuasirandomValue63(pos, dim); delta = (double)h_OutputGPU[dim * N + pos] - ref; sumDelta += fabs(delta); sumRef += fabs(ref); } printf("L1 norm: %E\n", sumDelta / sumRef); printf("\nTesting inverseCNDgpu()...\n\n"); checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float))); for (int i = -1; i < numIterations; i++) { if (i == 0) { 
checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); } if (useDoublePrecision) { inverseCND_SM13(d_Output, NULL, QRNG_DIMENSIONS * N); } else { inverseCND_SM10(d_Output, NULL, QRNG_DIMENSIONS * N); } } checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); gpuTime = sdkGetTimerValue(&hTimer)/(double)numIterations*1e-3; printf("quasirandomGenerator-inverse, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", (double)QRNG_DIMENSIONS * (double)N * 1E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128); printf("Reading GPU results...\n"); checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost)); printf("\nComparing to the CPU results...\n"); sumDelta = 0; sumRef = 0; unsigned int distance = ((unsigned int)-1) / (QRNG_DIMENSIONS * N + 1); for (pos = 0; pos < QRNG_DIMENSIONS * N; pos++) { unsigned int d = (pos + 1) * distance; ref = MoroInvCNDcpu(d); delta = (double)h_OutputGPU[pos] - ref; sumDelta += fabs(delta); sumRef += fabs(ref); } printf("L1 norm: %E\n\n", L1norm = sumDelta / sumRef); printf("Shutting down...\n"); sdkDeleteTimer(&hTimer); free(h_OutputGPU); checkCudaErrors(cudaFree(d_Output)); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(L1norm < 1e-6 ? EXIT_SUCCESS : EXIT_FAILURE); }
/////////////////////////////////////////////////////////////////////////////// // Main program /////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { // Start logs shrQAStart(argc, argv); // initialize the GPU, either identified by --device // or by picking the device with highest flop rate. int devID = findCudaDevice(argc, (const char **)argv); // parsing the number of random numbers to generate int rand_n = DEFAULT_RAND_N; if( checkCmdLineFlag(argc, (const char**) argv, "count") ) { rand_n = getCmdLineArgumentInt(argc, (const char**) argv, "count"); } printf("Allocating data for %i samples...\n", rand_n); // parsing the seed int seed = DEFAULT_SEED; if( checkCmdLineFlag(argc, (const char**) argv, "seed") ) { seed = getCmdLineArgumentInt(argc, (const char**) argv, "seed"); } printf("Seeding with %i ...\n", seed); float *d_Rand; checkCudaErrors( cudaMalloc((void **)&d_Rand, rand_n * sizeof(float)) ); curandGenerator_t prngGPU; checkCurandErrors( curandCreateGenerator(&prngGPU, CURAND_RNG_PSEUDO_MTGP32) ); checkCurandErrors( curandSetPseudoRandomGeneratorSeed(prngGPU, seed) ); curandGenerator_t prngCPU; checkCurandErrors( curandCreateGeneratorHost(&prngCPU, CURAND_RNG_PSEUDO_MTGP32) ); checkCurandErrors( curandSetPseudoRandomGeneratorSeed(prngCPU, seed) ); // // Example 1: Compare random numbers generated on GPU and CPU float *h_RandGPU = (float *)malloc(rand_n * sizeof(float)); printf("Generating random numbers on GPU...\n\n"); checkCurandErrors( curandGenerateUniform(prngGPU, (float*) d_Rand, rand_n) ); printf("\nReading back the results...\n"); checkCudaErrors( cudaMemcpy(h_RandGPU, d_Rand, rand_n * sizeof(float), cudaMemcpyDeviceToHost) ); float *h_RandCPU = (float *)malloc(rand_n * sizeof(float)); printf("Generating random numbers on CPU...\n\n"); checkCurandErrors( curandGenerateUniform(prngCPU, (float*) h_RandCPU, rand_n) ); printf("Comparing CPU/GPU random numbers...\n\n"); float L1norm = 
compareResults(rand_n, h_RandGPU, h_RandCPU); // // Example 2: Timing of random number generation on GPU const int numIterations = 10; int i; StopWatchInterface *hTimer; checkCudaErrors( cudaDeviceSynchronize() ); sdkCreateTimer(&hTimer); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); for (i = 0; i < numIterations; i++) { checkCurandErrors( curandGenerateUniform(prngGPU, (float*) d_Rand, rand_n) ); } checkCudaErrors( cudaDeviceSynchronize() ); sdkStopTimer(&hTimer); double gpuTime = 1.0e-3 * sdkGetTimerValue(&hTimer)/(double)numIterations; printf("MersenneTwister, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers\n", 1.0e-9 * rand_n / gpuTime, gpuTime, rand_n); printf("Shutting down...\n"); checkCurandErrors( curandDestroyGenerator(prngGPU) ); checkCurandErrors( curandDestroyGenerator(prngCPU) ); checkCudaErrors( cudaFree(d_Rand) ); sdkDeleteTimer( &hTimer); free(h_RandGPU); free(h_RandCPU); cudaDeviceReset(); shrQAFinishExit(argc, (const char**)argv, (L1norm < 1e-6) ? QA_PASSED : QA_FAILED); }
//////////////////////////////////////////////////////////////////////////////// //! Run a simple test matrix multiply using CUBLAS //////////////////////////////////////////////////////////////////////////////// int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size) { cudaDeviceProp deviceProp; checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); // use a larger block size for Fermi and above int block_size = (deviceProp.major < 2) ? 16 : 32; // set seed for rand() srand(2006); // allocate host memory for matrices A and B unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA; unsigned int mem_size_A = sizeof(float) * size_A; float *h_A = (float *)malloc(mem_size_A); unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB; unsigned int mem_size_B = sizeof(float) * size_B; float *h_B = (float *)malloc(mem_size_B); // set seed for rand() srand(2006); // initialize host memory randomInit(h_A, size_A); randomInit(h_B, size_B); // allocate device memory float *d_A, *d_B, *d_C; unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC; unsigned int mem_size_C = sizeof(float) * size_C; // allocate host memory for the result float *h_C = (float *) malloc(mem_size_C); float *h_CUBLAS = (float *) malloc(mem_size_C); checkCudaErrors(cudaMalloc((void **) &d_A, mem_size_A)); checkCudaErrors(cudaMalloc((void **) &d_B, mem_size_B)); checkCudaErrors(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMalloc((void **) &d_C, mem_size_C)); // setup execution parameters dim3 threads(block_size, block_size); dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y); // create and start timer printf("Computing result using CUBLAS..."); // execute the kernel int nIter = 30; // CUBLAS version 2.0 { const float alpha = 1.0f; const float beta = 0.0f; cublasHandle_t handle; cudaEvent_t start, stop; checkCudaErrors(cublasCreate(&handle)); 
//Perform warmup operation with cublas checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA)); // Allocate CUDA events that we'll use for timing checkCudaErrors(cudaEventCreate(&start)); checkCudaErrors(cudaEventCreate(&stop)); // Record the start event checkCudaErrors(cudaEventRecord(start, NULL)); for (int j = 0; j < nIter; j++) { //note cublas is column primary! //need to transpose the order checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA)); } printf("done.\n"); // Record the stop event checkCudaErrors(cudaEventRecord(stop, NULL)); // Wait for the stop event to complete checkCudaErrors(cudaEventSynchronize(stop)); float msecTotal = 0.0f; checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop)); // Compute and print the performance float msecPerMatrixMul = msecTotal / nIter; double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiWA * (double)matrix_size.uiHA * (double)matrix_size.uiWB; double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f); printf( "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n", gigaFlops, msecPerMatrixMul, flopsPerMatrixMul); // copy result from device to host checkCudaErrors(cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost)); // Destroy the handle checkCudaErrors(cublasDestroy(handle)); } // compute reference solution printf("Computing result using host CPU..."); float *reference = (float *)malloc(mem_size_C); matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB); printf("done.\n"); // check result (CUBLAS) bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f); if (resCUBLAS != true) { printDiff(reference, h_CUBLAS, matrix_size.uiWC, 
matrix_size.uiHC, 100, 1.0e-5f); } printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? "PASS" : "FAIL"); printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n"); // clean up memory free(h_A); free(h_B); free(h_C); free(reference); checkCudaErrors(cudaFree(d_A)); checkCudaErrors(cudaFree(d_B)); checkCudaErrors(cudaFree(d_C)); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); if (resCUBLAS == true) { return EXIT_SUCCESS; // return value = 1 } else { return EXIT_FAILURE; // return value = 0 } }
int main(int argc, char* argv[]) { if (argc!=3) { printf("Argument error!\n"); printf("1) number of FFTs to be calculated\n"); printf("2) number of kernel runs\n"); return 1; } char * pEnd; int nSamples = FFT_LENGTH; int nSpectra = strtol(argv[1],&pEnd,10); int nRuns = strtol(argv[2],&pEnd,10); int input_size=nSpectra*nSamples; int output_size=nSpectra*nSamples; if(nSamples<32) output_size=nSpectra*32; if (DEBUG) printf("\t\tWelcome\n"); float2 *h_input; float2 *h_output; float *h_temp; if (DEBUG) printf("\nHost memory allocation...\t"); h_input = (float2 *)malloc(input_size*sizeof(float2)); h_output = (float2 *)malloc(output_size*sizeof(float2)); h_temp = (float *)malloc(output_size*sizeof(float)); if (DEBUG) printf("done."); if (DEBUG) printf("\nHost memory memset...\t\t"); memset(h_input, 0.0, input_size*sizeof(float2)); memset(h_output, 0.0, output_size*sizeof(float2)); memset(h_temp, 0.0, output_size*sizeof(float)); if (DEBUG) printf("done."); if (DEBUG) printf("\nRandom data set...\t\t"); Generate_signal(h_temp,nSamples); srand(time(NULL)); for(int f=0;f<nSamples*nSpectra;f++){ h_input[f].y=rand() / (float)RAND_MAX; h_input[f].x=rand() / (float)RAND_MAX; } if (DEBUG) printf("done."); GPU_FFT(h_input, h_output, nSamples, nSpectra, nRuns); if (DEBUG && CHECK){ #ifdef CHECK_USING_FFTW double cumulative_error,mean_error; Full_FFT_check(h_output, h_input, nSamples, nSpectra, &cumulative_error, &mean_error); printf("Cumulative Error: %e, Mean Error: %e;\n",cumulative_error,mean_error); #endif } delete[] h_input; delete[] h_output; delete[] h_temp; cudaDeviceReset(); if (DEBUG) printf("\nFinished!\n"); return (0); }
/* * main should only control threads * * the threads should be invoked on different cores: * http://stackoverflow.com/questions/1407786/how-to-set-cpu-affinity-of-a-particular-pthread * https://www.google.pl/search?client=ubuntu&channel=fs&q=how+to+schedule+pthreads+through+cores&ie=utf-8&oe=utf-8&gfe_rd=cr&ei=PSudVePFOqeA4AShra2AAQ */ int main() { cudaDeviceReset(); cudaDeviceSynchronize(); // print device properties print_device(); // create pointers to data const uint64_t size = N; double complex* data_r_host = NULL; // initializing with NULL for debuging purposes double complex* data_k_host = NULL; // initializing with NULL for debuging purposes DataArray* data_arr_ptr = (DataArray*) malloc((size_t) sizeof(DataArray)); // change to global variable <- easier to code create_data_arr(data_arr_ptr, &data_r_host, &data_k_host, size); // allocate memory for array of streams const uint8_t num_streams = 2; // rewrite on defines? streams_arr = (cudaStream_t*) malloc( (size_t) sizeof(cudaStream_t)*num_streams); // create threads const uint8_t num_threads = 2; printf("host thread id\t %u\ndevice thread id %u\n",HOST_THRD, DEVICE_THRD); pthread_t* thread_ptr_arr = (pthread_t*) malloc( (size_t) sizeof(pthread_t)*num_threads ); // alternatively pthread_t* thread_ptr_arr[num_threads]; // init barier for threads pthread_barrier_init (&barrier, NULL, num_threads); // last number tells how many threads should be synchronized by this barier pthread_create(&thread_ptr_arr[HOST_THRD], NULL, host_thread, (void*) data_arr_ptr); pthread_create(&thread_ptr_arr[DEVICE_THRD], NULL, device_thread, (void*) data_arr_ptr); // for (uint8_t ii = 0; ii < num_threads; ii++) { // pthread_create(thread_ptr_arr[ii], NULL, host_thread, (void*) data_arr_ptr); // } //cudaStream_t stream1; //cudaStream_t stream2; //cudaStream_t* streams_arr[2] = {&stream1, &stream2}; void* status; pthread_join(thread_ptr_arr[HOST_THRD], &status); pthread_join(thread_ptr_arr[DEVICE_THRD], &status); printf("data visible 
in main thread:\n"); /*for (uint64_t ii=0; ii < (data_arr_ptr->size <= 32) ? data_arr_ptr->size : 32 ; ii++) { printf( "%lu.\t",ii ); printf( "%lf + %lf\t", creal(data_r_host[ii]), cimag(data_r_host[ii]) ); printf( "%lf + %lf\n", creal(data_k_host[ii]), cimag(data_k_host[ii]) ); }*/ free(thread_ptr_arr); free(streams_arr); free_data_arr(data_arr_ptr); cudaDeviceSynchronize(); free(data_arr_ptr); cudaThreadExit(); cudaDeviceSynchronize(); printf("Main: program completed. Exiting...\n"); return EXIT_SUCCESS; }
int main(int argc, char **argv) { char *dump_file = NULL; pArgc = &argc; pArgv = argv; printf("%s Starting...\n\n", sSDKsample); if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", (char **) &dump_file); int kernel = 1; if (checkCmdLineFlag(argc, (const char **)argv, "kernel")) { kernel = getCmdLineArgumentInt(argc, (const char **)argv, "kernel"); } runAutoTest(argc, argv, dump_file, kernel); } else { printf("[%s]\n", sSDKsample); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if (checkCmdLineFlag(argc, (const char **)argv, "device")) { printf("[%s]\n", argv[0]); printf(" Does not explicitly support -device=n in OpenGL mode\n"); printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); printf(" > %s -device=n -qatest\n", argv[0]); printf("exiting...\n"); exit(EXIT_SUCCESS); } // First load the image, so we know what the size of the image (imageW and imageH) printf("Allocating host and CUDA memory and loading image file...\n"); const char *image_path = sdkFindFilePath("portrait_noise.bmp", argv[0]); if (image_path == NULL) { printf("imageDenoisingGL was unable to find and load image file <portrait_noise.bmp>.\nExiting...\n"); exit(EXIT_FAILURE); } LoadBMPFile(&h_Src, &imageW, &imageH, image_path); printf("Data init done.\n"); // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. 
initGL(&argc, argv); cudaGLSetGLDevice(gpuGetMaxGflopsDeviceId()); checkCudaErrors(CUDA_MallocArray(&h_Src, imageW, imageH)); initOpenGLBuffers(); } printf("Starting GLUT main loop...\n"); printf("Press [1] to view noisy image\n"); printf("Press [2] to view image restored with knn filter\n"); printf("Press [3] to view image restored with nlm filter\n"); printf("Press [4] to view image restored with modified nlm filter\n"); printf("Press [*] to view smooth/edgy areas [RED/BLUE] Ct's when a filter is active\n"); printf("Press [f] to print frame rate\n"); printf("Press [?] to print Noise and Lerp Ct's\n"); printf("Press [q] to exit\n"); glutDisplayFunc(displayFunc); glutKeyboardFunc(shutDown); sdkCreateTimer(&timer); sdkStartTimer(&timer); glutTimerFunc(REFRESH_DELAY, timerEvent,0); glutMainLoop(); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(EXIT_SUCCESS); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { #if defined(__linux__) setenv ("DISPLAY", ":0", 0); #endif printf("%s Starting...\n\n", sSDKsample); printf("NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n\n"); numParticles = NUM_PARTICLES; uint gridDim = GRID_SIZE; numIterations = 0; if (argc > 1) { if (checkCmdLineFlag(argc, (const char **) argv, "n")) { numParticles = getCmdLineArgumentInt(argc, (const char **)argv, "n"); } if (checkCmdLineFlag(argc, (const char **) argv, "grid")) { gridDim = getCmdLineArgumentInt(argc, (const char **) argv, "grid"); } if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", &g_refFile); fpsLimit = frameCheckNumber; numIterations = 1; } } gridSize.x = gridSize.y = gridSize.z = gridDim; printf("grid: %d x %d x %d = %d cells\n", gridSize.x, gridSize.y, gridSize.z, gridSize.x*gridSize.y*gridSize.z); printf("particles: %d\n", numParticles); bool benchmark = checkCmdLineFlag(argc, (const char **) argv, "benchmark") != 0; if (checkCmdLineFlag(argc, (const char **) argv, "i")) { numIterations = getCmdLineArgumentInt(argc, (const char **) argv, "i"); } if (g_refFile) { cudaInit(argc, argv); } else { if (checkCmdLineFlag(argc, (const char **)argv, "device")) { printf("[%s]\n", argv[0]); printf(" Does not explicitly support -device=n in OpenGL mode\n"); printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); printf(" > %s -device=n -file=<*.bin>\n", argv[0]); printf("exiting...\n"); exit(EXIT_SUCCESS); } initGL(&argc, argv); cudaGLInit(argc, argv); } initParticleSystem(numParticles, gridSize, g_refFile==NULL); initParams(); if (!g_refFile) { initMenus(); } if (benchmark || g_refFile) { if (numIterations <= 0) { numIterations = 300; } 
runBenchmark(numIterations, argv[0]); } else { glutDisplayFunc(display); glutReshapeFunc(reshape); glutMouseFunc(mouse); glutMotionFunc(motion); glutKeyboardFunc(key); glutSpecialFunc(special); glutIdleFunc(idle); glutCloseFunc(cleanup); glutMainLoop(); } if (psystem) { delete psystem; } // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(g_TotalErrors > 0 ? EXIT_FAILURE : EXIT_SUCCESS); }