void loadImageData(int argc, char **argv) { // load image (needed so we can get the width and height before we create the window char *image_path = NULL; if (argc >= 1) { image_path = sdkFindFilePath(image_filename, argv[0]); } if (image_path == NULL) { fprintf(stderr, "Error finding image file '%s'\n", image_filename); exit(EXIT_FAILURE); } LoadBMPFile((uchar4 **)&hImage, &width, &height, image_path); if (!hImage) { fprintf(stderr, "Error opening file '%s'\n", image_path); exit(EXIT_FAILURE); } printf("Loaded '%s', %d x %d pixels\n\n", image_path, width, height); }
void loadImageData(int argc, char **argv) { // load image (needed so we can get the width and height before we create the window char *image_path = NULL; if (argc >= 1) { image_path = sdkFindFilePath(image_filename, argv[0]); } if (image_path == 0) { printf("Error finding image file '%s'\n", image_filename); exit(EXIT_FAILURE); } sdkLoadPPM4(image_path, (unsigned char **) &h_img, &width, &height); if (!h_img) { printf("Error opening file '%s'\n", image_path); exit(EXIT_FAILURE); } printf("Loaded '%s', %d x %d pixels\n", image_path, width, height); }
void runAutoTest(const char *ref_file, char *exec_path) { checkCudaErrors(cudaMalloc((void **)&d_output, width*height*sizeof(GLubyte)*4)); // render the volumeData render_kernel(gridSize, blockSize, d_output, width, height, w); checkCudaErrors(cudaDeviceSynchronize()); getLastCudaError("render_kernel failed"); void *h_output = malloc(width*height*sizeof(GLubyte)*4); checkCudaErrors(cudaMemcpy(h_output, d_output, width*height*sizeof(GLubyte)*4, cudaMemcpyDeviceToHost)); sdkDumpBin(h_output, width*height*sizeof(GLubyte)*4, "simpleTexture3D.bin"); bool bTestResult = sdkCompareBin2BinFloat("simpleTexture3D.bin", sdkFindFilePath(ref_file, exec_path), width*height, MAX_EPSILON_ERROR, THRESHOLD, exec_path); checkCudaErrors(cudaFree(d_output)); free(h_output); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); sdkStopTimer(&timer); sdkDeleteTimer(&timer); exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); }
void runAutoTest(int argc, char **argv, const char *filename, int kernel_param) { printf("[%s] - (automated testing w/ readback)\n", sSDKsample); int devID = findCudaDevice(argc, (const char **)argv); // First load the image, so we know what the size of the image (imageW and imageH) printf("Allocating host and CUDA memory and loading image file...\n"); const char *image_path = sdkFindFilePath("portrait_noise.bmp", argv[0]); if (image_path == NULL) { printf("imageDenoisingGL was unable to find and load image file <portrait_noise.bmp>.\nExiting...\n"); exit(EXIT_FAILURE); } LoadBMPFile(&h_Src, &imageW, &imageH, image_path); printf("Data init done.\n"); checkCudaErrors(CUDA_MallocArray(&h_Src, imageW, imageH)); TColor *d_dst = NULL; unsigned char *h_dst = NULL; checkCudaErrors(cudaMalloc((void **)&d_dst, imageW*imageH*sizeof(TColor))); h_dst = (unsigned char *)malloc(imageH*imageW*4); { g_Kernel = kernel_param; printf("[AutoTest]: %s <%s>\n", sSDKsample, filterMode[g_Kernel]); checkCudaErrors(CUDA_Bind2TextureArray()); runImageFilters(d_dst); checkCudaErrors(CUDA_UnbindTexture()); checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaMemcpy(h_dst, d_dst, imageW*imageH*sizeof(TColor), cudaMemcpyDeviceToHost)); sdkSavePPM4ub(filename, h_dst, imageW, imageH); } checkCudaErrors(CUDA_FreeArray()); free(h_Src); checkCudaErrors(cudaFree(d_dst)); free(h_dst); printf("\n[%s] -> Kernel %d, Saved: %s\n", sSDKsample, kernel_param, filename); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); }
// This test specifies a single test (where you specify radius and/or iterations) int runSingleTest(char *ref_file, char *exec_path) { int nTotalErrors = 0; char dump_file[256]; printf("[runSingleTest]: [%s]\n", sSDKsample); initCuda(); unsigned int *dResult; unsigned int *hResult = (unsigned int *)malloc(width * height * sizeof(unsigned int)); size_t pitch; checkCudaErrors(cudaMallocPitch((void **)&dResult, &pitch, width*sizeof(unsigned int), height)); // run the sample radius { printf("%s (radius=%d) (passes=%d) ", sSDKsample, filter_radius, iterations); bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer); // check if kernel execution generated an error getLastCudaError("Error: bilateralFilterRGBA Kernel execution FAILED"); checkCudaErrors(cudaDeviceSynchronize()); // readback the results to system memory cudaMemcpy2D(hResult, sizeof(unsigned int)*width, dResult, pitch, sizeof(unsigned int)*width, height, cudaMemcpyDeviceToHost); sprintf(dump_file, "nature_%02d.ppm", filter_radius); sdkSavePPM4ub((const char *)dump_file, (unsigned char *)hResult, width, height); if (!sdkComparePPM(dump_file, sdkFindFilePath(ref_file, exec_path), MAX_EPSILON_ERROR, 0.15f, false)) { printf("Image is Different "); nTotalErrors++; } else { printf("Image is Matching "); } printf(" <%s>\n", ref_file); } printf("\n"); free(hResult); checkCudaErrors(cudaFree(dResult)); return nTotalErrors; }
bool CheckRenderD3D10::PPMvsPPM(const char *src_file, const char *ref_file, const char *exec_path, const float epsilon, const float threshold) { char *ref_file_path = sdkFindFilePath(ref_file, exec_path); if (ref_file_path == NULL) { printf("CheckRenderD3D10::PPMvsPPM unable to find <%s> in <%s> Aborting comparison!\n", ref_file, exec_path); printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", ref_file); printf("Aborting comparison!\n"); printf(" FAILURE!\n"); return false; } return (sdkComparePPM(src_file,ref_file_path,epsilon,threshold,true) == true); }
/* Modified utility function taken from CUDA samples */ void loadDefaultImage(char *loc_exec) { printf("Reading image... \n"); const char *image_filename = "lena.pgm"; char *image_path = sdkFindFilePath(image_filename, loc_exec); if (image_path == NULL) { printf("Failed to read image file: <%s>\n", image_filename); exit(EXIT_FAILURE); } initializeData(image_path); free(image_path); }
bool CheckRender::PGMvsPGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold ) { unsigned char *src_data = NULL, *ref_data = NULL; unsigned long error_count = 0; unsigned int width, height; char *ref_file_path = sdkFindFilePath(ref_file, m_ExecPath); if (ref_file_path == NULL) { printf("CheckRender::PGMvsPGM unable to find <%s> in <%s> Aborting comparison!\n", ref_file, m_ExecPath); printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", ref_file); printf("Aborting comparison!\n"); printf(" FAILED\n"); error_count++; } else { if (src_file == NULL || ref_file_path == NULL) { printf("PGMvsPGM: Aborting comparison\n"); return false; } printf(" src_file <%s>\n", src_file); printf(" ref_file <%s>\n", ref_file_path); if (sdkLoadPPMub(ref_file_path, &ref_data, &width, &height) != true) { printf("PGMvsPGM: unable to load ref image file: %s\n", ref_file_path); return false; } if (sdkLoadPPMub(src_file, &src_data, &width, &height) != true) { printf("PGMvsPGM: unable to load src image file: %s\n", src_file); return false; } printf("PGMvsPGM: comparing images size (%d,%d) epsilon(%2.4f), threshold(%4.2f%%)\n", m_Height, m_Width, epsilon, threshold*100); if (compareDataAsFloatThreshold<unsigned char, float>( ref_data, src_data, m_Height*m_Width, epsilon, threshold ) == false) { error_count = 1; } } if (error_count == 0) { printf(" OK\n"); } else { printf(" FAILURE: %d errors...\n", (unsigned int)error_count); } return (error_count == 0); // returns true if all pixels pass }
void loadVolumeData(char *exec_path) { // load volume data const char *path = sdkFindFilePath(volumeFilename, exec_path); if (path == NULL) { fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n", volumeFilename); exit(EXIT_FAILURE); } size_t size = volumeSize.width*volumeSize.height*volumeSize.depth; uchar *h_volume = loadRawFile(path, size); initCuda(h_volume, volumeSize); sdkCreateTimer(&timer); free(h_volume); }
bool runSingleTest(const char *ref_file, const char *exec_path) { // allocate memory for result int nTotalErrors = 0; unsigned int *d_result; unsigned int size = width * height * sizeof(unsigned int); checkCudaErrors(cudaMalloc((void **) &d_result, size)); // warm-up gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); checkCudaErrors(cudaDeviceSynchronize()); sdkStartTimer(&timer); gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); checkCudaErrors(cudaDeviceSynchronize()); getLastCudaError("Kernel execution failed"); sdkStopTimer(&timer); unsigned char *h_result = (unsigned char *)malloc(width*height*4); checkCudaErrors(cudaMemcpy(h_result, d_result, width*height*4, cudaMemcpyDeviceToHost)); char dump_file[1024]; sprintf(dump_file, "lena_%02d.ppm", (int)sigma); sdkSavePPM4ub(dump_file, h_result, width, height); if (!sdkComparePPM(dump_file, sdkFindFilePath(ref_file, exec_path), MAX_EPSILON_ERROR, THRESHOLD, false)) { nTotalErrors++; } printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); printf("%.2f Mpixels/sec\n", (width*height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6); checkCudaErrors(cudaFree(d_result)); free(h_result); printf("Summary: %d errors!\n", nTotalErrors); printf(nTotalErrors == 0 ? "Test passed\n": "Test failed!\n"); return (nTotalErrors == 0); }
/////////////////////////////////////////////////////////////////////////////// /// \brief /// load 4-channel unsigned byte image /// and convert it to single channel FP32 image /// \param[out] img_data pointer to raw image data /// \param[out] img_w image width /// \param[out] img_h image height /// \param[out] img_s image row stride /// \param[in] name image file name /// \param[in] exePath executable file path /// \return true if image is successfully loaded or false otherwise /////////////////////////////////////////////////////////////////////////////// bool LoadImageAsFP32(float *&img_data, int &img_w, int &img_h, int &img_s, const char *name, const char *exePath) { printf("Loading \"%s\" ...\n", name); char *name_ = sdkFindFilePath(name, exePath); if (!name_) { printf("File not found\n"); return false; } unsigned char *data = 0; unsigned int w = 0, h = 0; bool result = sdkLoadPPM4ub(name_, &data, &w, &h); if (result == false) { printf("Invalid file format\n"); return false; } img_w = w; img_h = h; img_s = iAlignUp(img_w); img_data = new float [img_s * h]; // source is 4 channel image const int widthStep = 4 * img_w; for (int i = 0; i < img_h; ++i) { for (int j = 0; j < img_w; ++j) { img_data[j + i * img_s] = ((float) data[j * 4 + i * widthStep]) / 255.0f; } } return true; }
void loadImageData(int argc, char **argv) { // load image from disk uchar *h_data = NULL; char *srcImagePath = NULL; if ((srcImagePath = sdkFindFilePath(srcImageFilename, argv[0])) == NULL) { printf("bicubicTexture loadImageData() could not find <%s>\nExiting...\n", srcImageFilename); exit(EXIT_FAILURE); } sdkLoadPGM<unsigned char>(srcImagePath, &h_data, &imageWidth, &imageHeight); printf("Loaded '%s', %d x %d pixels\n", srcImageFilename, imageWidth, imageHeight); cx = imageWidth * 0.5f; cy = imageHeight * 0.5f; // initialize texture initTexture(imageWidth, imageHeight, h_data); }
bool CheckRender::PPMvsPPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold ) { unsigned long error_count = 0; char *ref_file_path = sdkFindFilePath(ref_file, m_ExecPath); if (ref_file_path == NULL) { printf("CheckRender::PPMvsPPM unable to find <%s> in <%s> Aborting comparison!\n", ref_file, m_ExecPath); printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", ref_file); printf("Aborting comparison!\n"); printf(" FAILED\n"); error_count++; } if (src_file == NULL || ref_file_path == NULL) { printf("PPMvsPPM: Aborting comparison\n"); return false; } printf(" src_file <%s>\n", src_file); printf(" ref_file <%s>\n", ref_file_path); return (sdkComparePPM( src_file, ref_file_path, epsilon, threshold, true ) == true ? true : false); }
int main(int argc, char *argv[]) { printf("%s Starting...\n\n", argv[0]); try { std::string sFilename; char *filePath = sdkFindFilePath("Lena.pgm", argv[0]); if (filePath) { sFilename = filePath; } else { printf("Error unable to find Lena.pgm\n"); exit(EXIT_FAILURE); } // set your own FreeImage error handler FreeImage_SetOutputMessage(FreeImageErrorHandler); cudaDeviceInit(argc, (const char **)argv); // Min spec is SM 1.0 devices if (printfNPPinfo(argc, argv, 1, 0) == false) { cudaDeviceReset(); exit(EXIT_SUCCESS); } if (argc > 1) { sFilename = argv[1]; } // if we specify the filename at the command line, then we only test sFilename // otherwise we will check both sFilename[0,1] int file_errors = 0; std::ifstream infile(sFilename.data(), std::ifstream::in); if (infile.good()) { std::cout << "freeImageInteropNPP opened: <" << sFilename.data() << "> successfully!" << std::endl; file_errors = 0; infile.close(); } else { std::cout << "freeImageInteropNPP unable to open: <" << sFilename.data() << ">" << std::endl; file_errors++; infile.close(); } if (file_errors > 0) { exit(EXIT_FAILURE); } std::string sResultFilename = sFilename; std::string::size_type dot = sResultFilename.rfind('.'); if (dot != std::string::npos) { sResultFilename = sResultFilename.substr(0, dot); } sResultFilename += "_boxFilterFII.pgm"; if (argc >= 3) { sResultFilename = argv[2]; } FREE_IMAGE_FORMAT eFormat = FreeImage_GetFileType(sFilename.c_str()); // no signature? try to guess the file format from the file extension if (eFormat == FIF_UNKNOWN) { eFormat = FreeImage_GetFIFFromFilename(sFilename.c_str()); } NPP_ASSERT(eFormat != FIF_UNKNOWN); // check that the plugin has reading capabilities ... FIBITMAP *pBitmap; if (FreeImage_FIFSupportsReading(eFormat)) { pBitmap = FreeImage_Load(eFormat, sFilename.c_str()); } NPP_ASSERT(pBitmap != 0); // Dump the bitmap information to the console std::cout << (*pBitmap) << std::endl; // make sure this is an 8-bit single channel image NPP_ASSERT(FreeImage_GetColorType(pBitmap) == FIC_MINISBLACK); NPP_ASSERT(FreeImage_GetBPP(pBitmap) == 8); unsigned int nImageWidth = FreeImage_GetWidth(pBitmap); unsigned int nImageHeight = FreeImage_GetHeight(pBitmap); unsigned int nSrcPitch = FreeImage_GetPitch(pBitmap); unsigned char *pSrcData = FreeImage_GetBits(pBitmap); int nSrcPitchCUDA; Npp8u *pSrcImageCUDA = nppiMalloc_8u_C1(nImageWidth, nImageHeight, &nSrcPitchCUDA); NPP_ASSERT_NOT_NULL(pSrcImageCUDA); // copy image loaded via FreeImage to into CUDA device memory, i.e. // transfer the image-data up to the GPU's video-memory NPP_CHECK_CUDA(cudaMemcpy2D(pSrcImageCUDA, nSrcPitchCUDA, pSrcData, nSrcPitch, nImageWidth, nImageHeight, cudaMemcpyHostToDevice)); // define size of the box filter const NppiSize oMaskSize = {7, 7}; const NppiPoint oMaskAchnor = {0, 0}; // compute maximal result image size const NppiSize oSizeROI = {nImageWidth - (oMaskSize.width - 1), nImageHeight - (oMaskSize.height - 1) }; // allocate result image memory int nDstPitchCUDA; Npp8u *pDstImageCUDA = nppiMalloc_8u_C1(oSizeROI.width, oSizeROI.height, &nDstPitchCUDA); NPP_ASSERT_NOT_NULL(pDstImageCUDA); NPP_CHECK_NPP(nppiFilterBox_8u_C1R(pSrcImageCUDA, nSrcPitchCUDA, pDstImageCUDA, nDstPitchCUDA, oSizeROI, oMaskSize, oMaskAchnor)); // create the result image storage using FreeImage so we can easily // save FIBITMAP *pResultBitmap = FreeImage_Allocate(oSizeROI.width, oSizeROI.height, 8 /* bits per pixel */); NPP_ASSERT_NOT_NULL(pResultBitmap); unsigned int nResultPitch = FreeImage_GetPitch(pResultBitmap); unsigned char *pResultData = FreeImage_GetBits(pResultBitmap); NPP_CHECK_CUDA(cudaMemcpy2D(pResultData, nResultPitch, pDstImageCUDA, nDstPitchCUDA, oSizeROI.width, oSizeROI.height, cudaMemcpyDeviceToHost)); // now save the result image bool bSuccess; bSuccess = FreeImage_Save(FIF_PGM, pResultBitmap, sResultFilename.c_str(), 0) == TRUE; NPP_ASSERT_MSG(bSuccess, "Failed to save result image."); //free nppiImage nppiFree(pSrcImageCUDA); nppiFree(pDstImageCUDA); cudaDeviceReset(); exit(EXIT_SUCCESS); } catch (npp::Exception &rException) { std::cerr << "Program error! The following exception occurred: \n"; std::cerr << rException << std::endl; std::cerr << "Aborting." << std::endl; exit(EXIT_FAILURE); } catch (...) { std::cerr << "Program error! An unknow type of exception occurred. \n"; std::cerr << "Aborting." << std::endl; exit(EXIT_FAILURE); } exit(EXIT_SUCCESS); }
inline bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file, unsigned int nelements, const float epsilon, const float threshold, char *exec_path) { unsigned int *src_buffer, *ref_buffer; FILE *src_fp = NULL, *ref_fp = NULL; unsigned long error_count = 0; size_t fsize = 0; if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) { printf("compareBin2Bin <unsigned int> unable to open src_file: %s\n", src_file); error_count++; } char *ref_file_path = sdkFindFilePath(ref_file, exec_path); if (ref_file_path == NULL) { printf("compareBin2Bin <unsigned int> unable to find <%s> in <%s>\n", ref_file, exec_path); printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", ref_file); printf("Aborting comparison!\n"); printf(" FAILED\n"); error_count++; if (src_fp) { fclose(src_fp); } if (ref_fp) { fclose(ref_fp); } } else { if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) { printf("compareBin2Bin <unsigned int> unable to open ref_file: %s\n", ref_file_path); error_count++; } if (src_fp && ref_fp) { src_buffer = (unsigned int *)malloc(nelements*sizeof(unsigned int)); ref_buffer = (unsigned int *)malloc(nelements*sizeof(unsigned int)); fsize = fread(src_buffer, nelements, sizeof(unsigned int), src_fp); fsize = fread(ref_buffer, nelements, sizeof(unsigned int), ref_fp); printf("> compareBin2Bin <unsigned int> nelements=%d, epsilon=%4.2f, threshold=%4.2f\n", nelements, epsilon, threshold); printf(" src_file <%s>, size=%d bytes\n", src_file, (int)fsize); printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, (int)fsize); if (!compareData<unsigned int, float>(ref_buffer, src_buffer, nelements, epsilon, threshold)) { error_count++; } fclose(src_fp); fclose(ref_fp); free(src_buffer); free(ref_buffer); } else { if (src_fp) { fclose(src_fp); } if (ref_fp) { fclose(ref_fp); } } } if (error_count == 0) { printf(" OK\n"); } else { printf(" FAILURE: %d errors...\n", (unsigned int)error_count); } return (error_count == 0); // returns true if all pixels pass }
int main(int argc, char **argv) { char *dump_file = NULL; #if defined(__linux__) setenv ("DISPLAY", ":0", 0); #endif pArgc = &argc; pArgv = argv; printf("%s Starting...\n\n", sSDKsample); if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", (char **) &dump_file); int kernel = 1; if (checkCmdLineFlag(argc, (const char **)argv, "kernel")) { kernel = getCmdLineArgumentInt(argc, (const char **)argv, "kernel"); } runAutoTest(argc, argv, dump_file, kernel); } else { printf("[%s]\n", sSDKsample); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if (checkCmdLineFlag(argc, (const char **)argv, "device")) { printf("[%s]\n", argv[0]); printf(" Does not explicitly support -device=n in OpenGL mode\n"); printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); printf(" > %s -device=n -qatest\n", argv[0]); printf("exiting...\n"); exit(EXIT_SUCCESS); } // First load the image, so we know what the size of the image (imageW and imageH) printf("Allocating host and CUDA memory and loading image file...\n"); const char *image_path = sdkFindFilePath("portrait_noise.bmp", argv[0]); if (image_path == NULL) { printf("imageDenoisingGL was unable to find and load image file <portrait_noise.bmp>.\nExiting...\n"); exit(EXIT_FAILURE); } LoadBMPFile(&h_Src, &imageW, &imageH, image_path); printf("Data init done.\n"); // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL(&argc, argv); cudaGLSetGLDevice(gpuGetMaxGflopsDeviceId()); checkCudaErrors(CUDA_MallocArray(&h_Src, imageW, imageH)); initOpenGLBuffers(); } printf("Starting GLUT main loop...\n"); printf("Press [1] to view noisy image\n"); printf("Press [2] to view image restored with knn filter\n"); printf("Press [3] to view image restored with nlm filter\n"); printf("Press [4] to view image restored with modified nlm filter\n"); printf("Press [*] to view smooth/edgy areas [RED/BLUE] Ct's when a filter is active\n"); printf("Press [f] to print frame rate\n"); printf("Press [?] to print Noise and Lerp Ct's\n"); printf("Press [q] to exit\n"); sdkCreateTimer(&timer); sdkStartTimer(&timer); glutMainLoop(); }
extern "C" void binomialOptionsGPU( real *callValue, TOptionData *optionData, int optN, int argc, char **argv ) { if (!moduleLoaded) { kernel_file = sdkFindFilePath("binomialOptions_kernel.cu", argv[0]); compileFileToPTX(kernel_file, 0, NULL, &ptx, &ptxSize); module = loadPTX(ptx, argc, argv); moduleLoaded = true; } __TOptionData h_OptionData[MAX_OPTIONS]; for (int i = 0; i < optN; i++) { const real T = optionData[i].T; const real R = optionData[i].R; const real V = optionData[i].V; const real dt = T / (real)NUM_STEPS; const real vDt = V * sqrt(dt); const real rDt = R * dt; //Per-step interest and discount factors const real If = exp(rDt); const real Df = exp(-rDt); //Values and pseudoprobabilities of upward and downward moves const real u = exp(vDt); const real d = exp(-vDt); const real pu = (If - d) / (u - d); const real pd = (real)1.0 - pu; const real puByDf = pu * Df; const real pdByDf = pd * Df; h_OptionData[i].S = (real)optionData[i].S; h_OptionData[i].X = (real)optionData[i].X; h_OptionData[i].vDt = (real)vDt; h_OptionData[i].puByDf = (real)puByDf; h_OptionData[i].pdByDf = (real)pdByDf; } CUfunction kernel_addr; checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "binomialOptionsKernel")); CUdeviceptr d_OptionData; checkCudaErrors(cuModuleGetGlobal(&d_OptionData, NULL, module, "d_OptionData")); checkCudaErrors(cuMemcpyHtoD(d_OptionData, h_OptionData, optN * sizeof(__TOptionData))); dim3 cudaBlockSize(128,1,1); dim3 cudaGridSize(optN, 1, 1); checkCudaErrors(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y, cudaGridSize.z, /* grid dim */ cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, /* block dim */ 0,0, /* shared mem, stream */ NULL, /* arguments */ 0)); checkCudaErrors(cuCtxSynchronize()); CUdeviceptr d_CallValue; checkCudaErrors(cuModuleGetGlobal(&d_CallValue, NULL, module, "d_CallValue")); checkCudaErrors(cuMemcpyDtoH(callValue, d_CallValue, optN *sizeof(real))); }
////////////////////////////////////////////////////////////////////////// // AUTOMATIC TESTING void runSingleTest(const char *ref_file, const char *exec_path) { uint *d_output; checkCudaErrors(cudaMalloc((void **)&d_output, width*height*sizeof(uint))); checkCudaErrors(cudaMemset(d_output, 0, width*height*sizeof(uint))); float modelView[16] = { 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 4.0f, 1.0f }; invViewMatrix[0] = modelView[0]; invViewMatrix[1] = modelView[4]; invViewMatrix[2] = modelView[8]; invViewMatrix[3] = modelView[12]; invViewMatrix[4] = modelView[1]; invViewMatrix[5] = modelView[5]; invViewMatrix[6] = modelView[9]; invViewMatrix[7] = modelView[13]; invViewMatrix[8] = modelView[2]; invViewMatrix[9] = modelView[6]; invViewMatrix[10] = modelView[10]; invViewMatrix[11] = modelView[14]; // call CUDA kernel, writing results to PBO VolumeRender_copyInvViewMatrix(invViewMatrix, sizeof(float4)*3); filterAnimation = false; // Start timer 0 and process n loops on the GPU int nIter = 10; float scale = 2.0f/float(nIter-1); for (int i = -1; i < nIter; i++) { if (i == 0) { cudaDeviceSynchronize(); sdkStartTimer(&timer); } filterFactor = (float(i) * scale) - 1.0f; filterFactor = -filterFactor; filter(); VolumeRender_render(gridSize, blockSize, d_output, width, height, density, brightness, transferOffset, transferScale); } cudaDeviceSynchronize(); sdkStopTimer(&timer); // Get elapsed time and throughput, then log to sample and master logs double dAvgTime = sdkGetTimerValue(&timer)/(nIter * 1000.0); printf("volumeFiltering, Throughput = %.4f MTexels/s, Time = %.5f s, Size = %u Texels, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * width * height)/dAvgTime, dAvgTime, (width * height), 1, blockSize.x * blockSize.y); getLastCudaError("Error: kernel execution FAILED"); checkCudaErrors(cudaDeviceSynchronize()); unsigned char *h_output = (unsigned char *)malloc(width*height*4); checkCudaErrors(cudaMemcpy(h_output, d_output, width*height*4, cudaMemcpyDeviceToHost)); sdkSavePPM4ub("volumefilter.ppm", h_output, width, height); bool bTestResult = sdkComparePPM("volumefilter.ppm", sdkFindFilePath(ref_file, exec_path), MAX_EPSILON_ERROR, THRESHOLD, true); checkCudaErrors(cudaFree(d_output)); free(h_output); cleanup(); exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); }
void initData(int argc, char **argv) { // parse arguments char *filename; if (getCmdLineArgumentString(argc, (const char **) argv, "file", &filename)) { volumeFilename = filename; } int n; if (checkCmdLineFlag(argc, (const char **) argv, "size")) { n = getCmdLineArgumentInt(argc, (const char **) argv, "size"); volumeSize.width = volumeSize.height = volumeSize.depth = n; } if (checkCmdLineFlag(argc, (const char **) argv, "xsize")) { n = getCmdLineArgumentInt(argc, (const char **) argv, "xsize"); volumeSize.width = n; } if (checkCmdLineFlag(argc, (const char **) argv, "ysize")) { n = getCmdLineArgumentInt(argc, (const char **) argv, "ysize"); volumeSize.height = n; } if (checkCmdLineFlag(argc, (const char **) argv, "zsize")) { n = getCmdLineArgumentInt(argc, (const char **) argv, "zsize"); volumeSize.depth = n; } char *path = sdkFindFilePath(volumeFilename, argv[0]); if (path == 0) { printf("Error finding file '%s'\n", volumeFilename); exit(EXIT_FAILURE); } size_t size = volumeSize.width*volumeSize.height*volumeSize.depth*sizeof(VolumeType); void *h_volume = loadRawFile(path, size); FilterKernel_init(); Volume_init(&volumeOriginal,volumeSize, h_volume, 0); free(h_volume); Volume_init(&volumeFilter0, volumeSize, NULL, 1); Volume_init(&volumeFilter1, volumeSize, NULL, 1); VolumeRender_init(); VolumeRender_setPreIntegrated(preIntegrated); VolumeRender_setVolume(&volumeOriginal); sdkCreateTimer(&timer); sdkCreateTimer(&animationTimer); sdkStartTimer(&animationTimer); // calculate new grid size gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y)); }
int main(int argc, char **argv) { // Start logs printf("[%s] - Starting...\n", argv[0]); //'h_' prefix - CPU (host) memory space float //Results calculated by CPU for reference *h_CallResultCPU, *h_PutResultCPU, //CPU copy of GPU results *h_CallResultGPU, *h_PutResultGPU, //CPU instance of input data *h_StockPrice, *h_OptionStrike, *h_OptionYears; //'d_' prefix - GPU (device) memory space CUdeviceptr //Results calculated by GPU d_CallResult, d_PutResult, //GPU instance of input data d_StockPrice, d_OptionStrike, d_OptionYears; double delta, ref, sum_delta, sum_ref, max_delta, L1norm, gpuTime; StopWatchInterface *hTimer = NULL; int i; sdkCreateTimer(&hTimer); printf("Initializing data...\n"); printf("...allocating CPU memory for options.\n"); h_CallResultCPU = (float *)malloc(OPT_SZ); h_PutResultCPU = (float *)malloc(OPT_SZ); h_CallResultGPU = (float *)malloc(OPT_SZ); h_PutResultGPU = (float *)malloc(OPT_SZ); h_StockPrice = (float *)malloc(OPT_SZ); h_OptionStrike = (float *)malloc(OPT_SZ); h_OptionYears = (float *)malloc(OPT_SZ); char *ptx, *kernel_file; size_t ptxSize; kernel_file = sdkFindFilePath("BlackScholes_kernel.cuh", argv[0]); // Set a Compiler Option to have maximum register to be used by each thread. char *compile_options[1]; compile_options[0] = (char *) malloc(sizeof(char)*(strlen("--maxrregcount=16"))); strcpy((char *)compile_options[0],"--maxrregcount=16"); // Compile the kernel BlackScholes_kernel. compileFileToPTX(kernel_file, 1, (const char **)compile_options, &ptx, &ptxSize); CUmodule module = loadPTX(ptx, argc, argv); CUfunction kernel_addr; checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "BlackScholesGPU")); printf("...allocating GPU memory for options.\n"); checkCudaErrors(cuMemAlloc(&d_CallResult, OPT_SZ)); checkCudaErrors(cuMemAlloc(&d_PutResult, OPT_SZ)); checkCudaErrors(cuMemAlloc(&d_StockPrice, OPT_SZ)); checkCudaErrors(cuMemAlloc(&d_OptionStrike,OPT_SZ)); checkCudaErrors(cuMemAlloc(&d_OptionYears, OPT_SZ)); printf("...generating input data in CPU mem.\n"); srand(5347); //Generate options set for (i = 0; i < OPT_N; i++) { h_CallResultCPU[i] = 0.0f; h_PutResultCPU[i] = -1.0f; h_StockPrice[i] = RandFloat(5.0f, 30.0f); h_OptionStrike[i] = RandFloat(1.0f, 100.0f); h_OptionYears[i] = RandFloat(0.25f, 10.0f); } printf("...copying input data to GPU mem.\n"); //Copy options data to GPU memory for further processing checkCudaErrors(cuMemcpyHtoD(d_StockPrice, h_StockPrice, OPT_SZ)); checkCudaErrors(cuMemcpyHtoD(d_OptionStrike, h_OptionStrike, OPT_SZ)); checkCudaErrors(cuMemcpyHtoD(d_OptionYears, h_OptionYears, OPT_SZ)); printf("Data init done.\n\n"); printf("Executing Black-Scholes GPU kernel (%i iterations)...\n", NUM_ITERATIONS); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); dim3 cudaBlockSize( 128, 1, 1); dim3 cudaGridSize(DIV_UP(OPT_N/2, 128),1,1); float risk = RISKFREE; float volatility = VOLATILITY; int optval = OPT_N; void *arr[] = { (void *)&d_CallResult, (void *)&d_PutResult, (void *)&d_StockPrice, (void *)&d_OptionStrike, (void *)&d_OptionYears, (void *)&risk, (void *)&volatility, (void *)&optval }; for (i = 0; i < NUM_ITERATIONS; i++) { checkCudaErrors(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y, cudaGridSize.z, /* grid dim */ cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, /* block dim */ 0,0, /* shared mem, stream */ &arr[0], /* arguments */ 0)); } checkCudaErrors(cuCtxSynchronize()); sdkStopTimer(&hTimer); gpuTime = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS; //Both call and put is calculated printf("Options count : %i \n", 2 * OPT_N); printf("BlackScholesGPU() time : %f msec\n", gpuTime); printf("Effective memory bandwidth: %f GB/s\n", ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (gpuTime * 1E-3)); printf("Gigaoptions per second : %f \n\n", ((double)(2 * OPT_N) * 1E-9) / (gpuTime * 1E-3)); printf("BlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u options, NumDevsUsed = %u, Workgroup = %u\n", (((double)(2.0 * OPT_N) * 1.0E-9) / (gpuTime * 1.0E-3)), gpuTime*1e-3, (2 * OPT_N), 1, 128); printf("\nReading back GPU results...\n"); //Read back GPU results to compare them to CPU results checkCudaErrors(cuMemcpyDtoH(h_CallResultGPU, d_CallResult, OPT_SZ)); checkCudaErrors(cuMemcpyDtoH(h_PutResultGPU, d_PutResult, OPT_SZ)); printf("Checking the results...\n"); printf("...running CPU calculations.\n\n"); //Calculate options values on CPU BlackScholesCPU( h_CallResultCPU, h_PutResultCPU, h_StockPrice, h_OptionStrike, h_OptionYears, RISKFREE, VOLATILITY, OPT_N ); printf("Comparing the results...\n"); //Calculate max absolute difference and L1 distance //between CPU and GPU results sum_delta = 0; sum_ref = 0; max_delta = 0; for (i = 0; i < OPT_N; i++) { ref = h_CallResultCPU[i]; delta = fabs(h_CallResultCPU[i] - h_CallResultGPU[i]); if (delta > max_delta) { max_delta = delta; } sum_delta += delta; sum_ref += fabs(ref); } L1norm = sum_delta / sum_ref; printf("L1 norm: %E\n", L1norm); printf("Max absolute error: %E\n\n", max_delta); printf("Shutting down...\n"); printf("...releasing GPU memory.\n"); checkCudaErrors(cuMemFree(d_OptionYears)); checkCudaErrors(cuMemFree(d_OptionStrike)); checkCudaErrors(cuMemFree(d_StockPrice)); checkCudaErrors(cuMemFree(d_PutResult)); checkCudaErrors(cuMemFree(d_CallResult)); printf("...releasing CPU memory.\n"); free(h_OptionYears); free(h_OptionStrike); free(h_StockPrice); free(h_PutResultGPU); free(h_CallResultGPU); free(h_PutResultCPU); free(h_CallResultCPU); sdkDeleteTimer(&hTimer); printf("Shutdown done.\n"); printf("\n[%s] - Test Summary\n", argv[0]); cuProfilerStop(); if (L1norm > 1e-6) { printf("Test failed!\n"); exit(EXIT_FAILURE); } printf("Test passed\n"); exit(EXIT_SUCCESS); }
void parseCommandLineArguments(int argc, char *argv[]) { char video_file[256]; printf("Command Line Arguments:\n"); for (int n=0; n < argc; n++) { printf("argv[%d] = %s\n", n, argv[n]); } if (checkCmdLineFlag(argc, (const char **)argv, "help")) { displayHelp(); exit(EXIT_SUCCESS); } if (checkCmdLineFlag(argc, (const char **)argv, "decodecuda")) { g_eVideoCreateFlags = cudaVideoCreate_PreferCUDA; } if (checkCmdLineFlag(argc, (const char **)argv, "decodedxva")) { g_eVideoCreateFlags = cudaVideoCreate_PreferDXVA; } if (checkCmdLineFlag(argc, (const char **)argv, "decodecuvid")) { g_eVideoCreateFlags = cudaVideoCreate_PreferCUVID; } if (checkCmdLineFlag(argc, (const char **)argv, "vsync")) { g_bUseVsync = true; } if (checkCmdLineFlag(argc, (const char **)argv, "novsync")) { g_bUseVsync = false; } if (checkCmdLineFlag(argc, (const char **)argv, "repeatframe")) { g_bFrameRepeat = true; } if (checkCmdLineFlag(argc, (const char **)argv, "framestep")) { g_bFrameStep = true; g_bUseDisplay = true; g_bUseInterop = true; g_fpsLimit = 1; } if (checkCmdLineFlag(argc, (const char **)argv, "updateall")) { g_bUpdateAll = true; } if (checkCmdLineFlag(argc, (const char **)argv, "displayvideo")) { g_bUseDisplay = true; g_bUseInterop = true; } if (checkCmdLineFlag(argc, (const char **)argv, "nointerop")) { g_bUseInterop = false; } if (checkCmdLineFlag(argc, (const char **)argv, "readback")) { g_bReadback = true; } if (checkCmdLineFlag(argc, (const char **)argv, "device")) { g_DeviceID = getCmdLineArgumentInt(argc, (const char **)argv, "device"); g_bUseDisplay = true; g_bUseInterop = true; } if (g_bUseDisplay == false) { g_bQAReadback = true; g_bUseInterop = false; } if (g_bLoop == false) { g_bAutoQuit = true; } // Search all command file parameters for video files with extensions: // mp4, avc, mkv, 264, h264. vc1, wmv, mp2, mpeg2, mpg char *file_ext = NULL; for (int i=1; i < argc; i++) { if (getFileExtension(argv[i], &file_ext) > 0) { strcpy(video_file, argv[i]); break; } } // We load the default video file for the SDK sample if (file_ext == NULL) { strcpy(video_file, sdkFindFilePath(VIDEO_SOURCE_FILE, argv[0])); } // Now verify the video file is legit FILE *fp = fopen(video_file, "r"); if (video_file == NULL && fp == NULL) { printf("[%s]: unable to find file: <%s>\nExiting...\n", sAppFilename, VIDEO_SOURCE_FILE); exit(EXIT_FAILURE); } if (fp) { fclose(fp); } // default video file loaded by this sample sFileName = video_file; // store the current path so we can reinit the CUDA context strcpy(exec_path, argv[0]); printf("[%s]: input file: <%s>\n", sAppFilename, video_file); }
void runAutoTest(int argc, char *argv[]) { printf("[%s] (automated testing w/ readback)\n", sSDKsample); int devID = findCudaDevice(argc, (const char **)argv); loadDefaultImage(argv[0]); Pixel *d_result; checkCudaErrors(cudaMalloc((void **)&d_result, imWidth*imHeight*sizeof(Pixel))); char *ref_file = NULL; char dump_file[256]; int mode = 0; mode = getCmdLineArgumentInt(argc, (const char **)argv, "mode"); getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); switch (mode) { case 0: g_SobelDisplayMode = SOBELDISPLAY_IMAGE; sprintf(dump_file, "lena_orig.pgm"); break; case 1: g_SobelDisplayMode = SOBELDISPLAY_SOBELTEX; sprintf(dump_file, "lena_tex.pgm"); break; case 2: g_SobelDisplayMode = SOBELDISPLAY_SOBELSHARED; sprintf(dump_file, "lena_shared.pgm"); break; default: printf("Invalid Filter Mode File\n"); exit(EXIT_FAILURE); break; } printf("AutoTest: %s <%s>\n", sSDKsample, filterMode[g_SobelDisplayMode]); sobelFilter(d_result, imWidth, imHeight, g_SobelDisplayMode, imageScale); checkCudaErrors(cudaDeviceSynchronize()); unsigned char *h_result = (unsigned char *)malloc(imWidth*imHeight*sizeof(Pixel)); checkCudaErrors(cudaMemcpy(h_result, d_result, imWidth*imHeight*sizeof(Pixel), cudaMemcpyDeviceToHost)); sdkSavePGM(dump_file, h_result, imWidth, imHeight); if (!sdkComparePGM(dump_file, sdkFindFilePath(ref_file, argv[0]), MAX_EPSILON_ERROR, 0.15f, false)) { g_TotalErrors++; } checkCudaErrors(cudaFree(d_result)); free(h_result); if (g_TotalErrors != 0) { printf("Test failed!\n"); exit(EXIT_FAILURE); } printf("Test passed!\n"); exit(EXIT_SUCCESS); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { pArgc = &argc; pArgv = argv; char *ref_file = NULL; #if defined(__linux__) setenv ("DISPLAY", ":0", 0); #endif printf("%s Starting...\n\n", sSDKsample); printf("NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n\n"); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if (argc > 1) { if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); fpsLimit = frameCheckNumber; } } // Get the path of the filename char *filename; if (getCmdLineArgumentString(argc, (const char **) argv, "image", &filename)) { image_filename = filename; } // load image char *image_path = sdkFindFilePath(image_filename, argv[0]); if (image_path == NULL) { fprintf(stderr, "Error unable to find and load image file: '%s'\n", image_filename); exit(EXIT_FAILURE); } sdkLoadPPM4ub(image_path, (unsigned char **)&h_img, &width, &height); if (!h_img) { printf("Error unable to load PPM file: '%s'\n", image_path); exit(EXIT_FAILURE); } printf("Loaded '%s', %d x %d pixels\n", image_path, width, height); if (checkCmdLineFlag(argc, (const char **)argv, "threads")) { nthreads = getCmdLineArgumentInt(argc, (const char **) argv, "threads"); } if (checkCmdLineFlag(argc, (const char **)argv, "sigma")) { sigma = getCmdLineArgumentFloat(argc, (const char **) argv, "sigma"); } runBenchmark = checkCmdLineFlag(argc, (const char **) argv, "benchmark"); int device; struct cudaDeviceProp prop; cudaGetDevice(&device); cudaGetDeviceProperties(&prop, device); if (!strncmp("Tesla", prop.name, 5)) { printf("Tesla card detected, running the test in benchmark mode (no OpenGL display)\n"); // runBenchmark = true; runBenchmark = true; } // Benchmark or AutoTest mode detected, no OpenGL if (runBenchmark == true || ref_file != NULL) { findCudaDevice(argc, (const char **)argv); } else { // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL(&argc, argv); findCudaGLDevice(argc, (const char **)argv); } initCudaBuffers(); if (ref_file) { printf("(Automated Testing)\n"); bool testPassed = runSingleTest(ref_file, argv[0]); cleanup(); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(testPassed ? EXIT_SUCCESS : EXIT_FAILURE); } if (runBenchmark) { printf("(Run Benchmark)\n"); benchmark(100); cleanup(); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(EXIT_SUCCESS); } initGLBuffers(); glutMainLoop(); exit(EXIT_SUCCESS); }
bool CheckRender::compareBin2BinFloat(const char *src_file, const char *ref_file, unsigned int nelements, const float epsilon, const float threshold) { float *src_buffer, *ref_buffer; FILE *src_fp = NULL, *ref_fp = NULL; size_t fsize = 0; unsigned long error_count = 0; if ((src_fp = fopen(src_file, "rb")) == NULL) { printf("compareBin2Bin <float> unable to open src_file: %s\n", src_file); error_count = 1; } char *ref_file_path = sdkFindFilePath(ref_file, m_ExecPath); if (ref_file_path == NULL) { printf("compareBin2Bin <float> unable to find <%s> in <%s>\n", ref_file, m_ExecPath); printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", m_ExecPath); printf("Aborting comparison!\n"); printf(" FAILED\n"); error_count++; if (src_fp) fclose(src_fp); if (ref_fp) fclose(ref_fp); } else { if ((ref_fp = fopen(ref_file_path, "rb")) == NULL) { printf("compareBin2Bin <float> unable to open ref_file: %s\n", ref_file_path); error_count = 1; } if (src_fp && ref_fp) { src_buffer = (float *)malloc(nelements*sizeof(float)); ref_buffer = (float *)malloc(nelements*sizeof(float)); fsize = fread(src_buffer, nelements, sizeof(float), src_fp); fsize = fread(ref_buffer, nelements, sizeof(float), ref_fp); printf("> compareBin2Bin <float> nelements=%d, epsilon=%4.2f, threshold=%4.2f\n", nelements, epsilon, threshold); printf(" src_file <%s>\n", src_file); printf(" ref_file <%s>\n", ref_file_path); if (!compareDataAsFloatThreshold<float, float>( ref_buffer, src_buffer, nelements, epsilon, threshold)) { error_count++; } fclose(src_fp); fclose(ref_fp); free(src_buffer); free(ref_buffer); } else { if (src_fp) fclose(src_fp); if (ref_fp) fclose(ref_fp); } } if (error_count == 0) { printf(" OK\n"); } else { printf(" FAILURE: %d errors...\n", (unsigned int)error_count); } return (error_count == 0); // returns true if all pixels pass }
int main(int argc, char *argv[]) { printf("%s Starting...\n\n", argv[0]); try { std::string sFilename; char *filePath = sdkFindFilePath("person.txt", argv[0]); if (filePath) { sFilename = filePath; } else { printf("Error %s was unable to find person.txt\n", argv[0]); exit(EXIT_FAILURE); } cudaDeviceInit(argc, (const char **)argv); printfNPPinfo(argc, argv); if (g_bQATest == false && (g_nDevice == -1) && argc > 1) { sFilename = argv[1]; } // if we specify the filename at the command line, then we only test sFilename int file_errors = 0; std::ifstream infile(sFilename.data(), std::ifstream::in); if (infile.good()) { std::cout << "imageSegmentationNPP opened: <" << sFilename.data() << "> successfully!" << std::endl; file_errors = 0; infile.close(); } else { std::cout << "imageSegmentationNPP unable to open: <" << sFilename.data() << ">" << std::endl; file_errors++; infile.close(); } if (file_errors > 0) { exit(EXIT_FAILURE); } std::string sResultFilename = sFilename; std::string::size_type dot = sResultFilename.rfind('.'); if (dot != std::string::npos) { sResultFilename = sResultFilename.substr(0, dot); } sResultFilename += "_segmentation.pgm"; if (argc >= 3 && !g_bQATest) { sResultFilename = argv[2]; } // load MRF declaration int width, height, nLabels; int *hCue, *vCue, *dataCostArray; loadMiddleburyMRFData(sFilename, dataCostArray, hCue, vCue, width, height, nLabels); NPP_ASSERT(nLabels == 2); std::cout << "Dataset: " << sFilename << std::endl; std::cout << "Size: " << width << "x" << height << std::endl; NppiSize size; size.width = width; size.height = height; NppiRect roi; roi.x=0; roi.y=0; roi.width=width; roi.height=height; // Setup flow network int step, transposed_step; Npp32s *d_source, *d_sink, *d_terminals, *d_left_transposed, *d_right_transposed, *d_top, *d_bottom; // Setup terminal capacities d_source = nppiMalloc_32s_C1(width, height, &step); cudaMemcpy2D(d_source, step, dataCostArray, width * sizeof(int), width*sizeof(int), height, cudaMemcpyHostToDevice); d_sink = nppiMalloc_32s_C1(width, height, &step); cudaMemcpy2D(d_sink, step, &dataCostArray[width*height], width * sizeof(int), width*sizeof(int), height, cudaMemcpyHostToDevice); d_terminals = nppiMalloc_32s_C1(width, height, &step); nppiSub_32s_C1RSfs(d_sink, step, d_source, step, d_terminals, step, size, 0); // Setup edge capacities NppiSize edgeTranposedSize; edgeTranposedSize.width = height; edgeTranposedSize.height = width-1; NppiSize oneRowTranposedSize; oneRowTranposedSize.width = height; oneRowTranposedSize.height = 1; d_right_transposed = nppiMalloc_32s_C1(height, width, &transposed_step); cudaMemcpy2D(d_right_transposed, transposed_step, hCue, height * sizeof(int), height * sizeof(int), width, cudaMemcpyHostToDevice); d_left_transposed = nppiMalloc_32s_C1(height, width, &transposed_step); nppiSet_32s_C1R(0, d_left_transposed, transposed_step, oneRowTranposedSize); nppiCopy_32s_C1R(d_right_transposed, transposed_step, d_left_transposed + transposed_step/sizeof(int), transposed_step, edgeTranposedSize); NppiSize edgeSize; edgeSize.width = width; edgeSize.height = height-1; NppiSize oneRowSize; oneRowSize.width = width; oneRowSize.height = 1; d_bottom = nppiMalloc_32s_C1(width, height, &step); cudaMemcpy2D(d_bottom, step, vCue, width * sizeof(int), width*sizeof(int), height, cudaMemcpyHostToDevice); d_top = nppiMalloc_32s_C1(width, height, &step); nppiSet_32s_C1R(0, d_top, step, oneRowSize); nppiCopy_32s_C1R(d_bottom, step, d_top + step/sizeof(int), step, edgeSize); // Allocate temp storage for graphcut computation Npp8u *pBuffer; int bufferSize; nppiGraphcutGetSize(size, &bufferSize); cudaMalloc(&pBuffer, bufferSize); NppiGraphcutState *pGraphcutState; nppiGraphcutInitAlloc(size, &pGraphcutState, pBuffer); // Allocate label storage npp::ImageNPP_8u_C1 oDeviceDst(width, height); cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); // Compute the graphcut, result is 0 / !=0 cudaEventRecord(start,0); nppiGraphcut_32s8u(d_terminals, d_left_transposed, d_right_transposed, d_top, d_bottom, step, transposed_step, size, oDeviceDst.data(), oDeviceDst.pitch(), pGraphcutState); cudaEventRecord(stop,0); cudaEventSynchronize(stop); float time; cudaEventElapsedTime(&time, start, stop); std::cout << "Elapsed Time: " << time << " ms" << std::endl; // declare a host image object for an 8-bit grayscale image npp::ImageCPU_8u_C1 oHostAlpha(width, height); // convert graphcut result to 0/255 alpha image using new nppiCompareC_8u_C1R primitive (CUDA 5.0) npp::ImageNPP_8u_C1 oDeviceAlpha(width, height); nppiCompareC_8u_C1R(oDeviceDst.data(), oDeviceDst.pitch(), 0, oDeviceAlpha.data(), oDeviceAlpha.pitch(), size, NPP_CMP_GREATER); // and copy the result to host oDeviceAlpha.copyTo(oHostAlpha.data(), oHostAlpha.pitch()); int E_d, E_s; std::cout << "Graphcut Cost: " << computeEnergy(E_d, E_s, oHostAlpha.data(), oHostAlpha.pitch(), hCue, vCue, dataCostArray, width, height) << std::endl; std::cout << "(E_d = " << E_d << ", E_s = " << E_s << ")" << std::endl; std::cout << "Saving segmentation result as " << sResultFilename << std::endl; saveImage(sResultFilename, oHostAlpha); nppiGraphcutFree(pGraphcutState); cudaFree(pBuffer); cudaFree(d_top); cudaFree(d_bottom); cudaFree(d_left_transposed); cudaFree(d_right_transposed); cudaFree(d_source); cudaFree(d_sink); cudaFree(d_terminals); exit(EXIT_SUCCESS); } catch (npp::Exception &rException) { std::cerr << "Program error! The following exception occurred: \n"; std::cerr << rException << std::endl; std::cerr << "Aborting." << std::endl; exit(EXIT_FAILURE); } catch (...) { std::cerr << "Program error! An unknow type of exception occurred. \n"; std::cerr << "Aborting." << std::endl; exit(EXIT_FAILURE); } return 0; }