// Function to generate CPU reference filter //! @param pIn Input Image //! @param pTwiddleDFT DFT twiddle matrix //! @param pTwiddleIDFT IDFT twiddle matrix //! @param pFilterCoeffs Filter coefficients void calcGoldRef( unsigned char *pIn, double complex *pTwiddleDFT, double complex *pTwiddleIDFT, float *pFilterCoeffs ) { // Output image char *fnameOut = "data/lena_filtered_cpu_lp.pgm"; // Benchmark time stamps clock_t timer; timer = clock(); // Capture local iterators unsigned char *m_pIn = pIn; double complex *m_pTwiddleDFT = pTwiddleDFT; double complex *m_pTwiddleIDFT = pTwiddleIDFT; float *m_pFilterCoeffs = pFilterCoeffs; // Calculate row-wise 1D DFT double complex *m_pTemp, *m_pTempRef; m_pTemp = new double complex[N*N]; m_pTempRef = m_pTemp; memset( m_pTemp, 0, sizeof( double complex ) * N * N ); for ( int i = 0; i < N; ++i ) { m_pIn = pIn + i * N; for (int j = 0; j < N; ++j, ++m_pTemp ) { m_pTwiddleDFT = pTwiddleDFT + j; m_pIn = pIn + i * N; for ( int k = 0; k < N; ++k, ++m_pIn ) { *m_pTemp += (double)*m_pIn * *m_pTwiddleDFT; m_pTwiddleDFT += N; } } } std::cout << "Finished 1D DFT .." << std::endl; // Reset pointers for next stage m_pTemp = m_pTempRef; m_pTwiddleDFT = pTwiddleDFT; // Next, calculate column-wise 1D DFT double complex *m_p2D_xfm, *m_p2D_xfmRef; m_p2D_xfm = new double complex[N*N]; m_p2D_xfmRef = m_p2D_xfm; memset( m_p2D_xfm, 0, sizeof( double complex ) * N * N ); for ( int i = 0; i < N; ++i ) { m_pTwiddleDFT = pTwiddleDFT + i * N; for (int j = 0; j < N; ++j, ++m_p2D_xfm ) { m_pTwiddleDFT = pTwiddleDFT + i * N; m_pTemp = m_pTempRef + j; for ( int k = 0; k < N; ++k, ++m_pTwiddleDFT ) { *m_p2D_xfm += *m_pTemp * *m_pTwiddleDFT; m_pTemp += N; } } } std::cout << "Finished 2D DFT .." << std::endl; // Reset pointers for next stages m_p2D_xfm = m_p2D_xfmRef; m_pTemp = m_pTempRef; // Next, apply filter to the transformed data double complex *m_pFilterOut, *m_pFilterOutRef; m_pFilterOut = new double complex[N*N]; m_pFilterOutRef = m_pFilterOut; memset( m_pFilterOut, 0, sizeof( double complex ) * N * N ); // Perform element multiplication for filter application for ( int i = 0; i < N * N; ++i, ++m_pFilterCoeffs, ++m_p2D_xfm, ++m_pFilterOut ) *m_pFilterOut = *m_pFilterCoeffs * *m_p2D_xfm; std::cout << "Finished Filtering .." << std::endl; // Reset pointers for next stages m_pFilterOut = m_pFilterOutRef; m_p2D_xfm = m_p2D_xfmRef; // Next, apply row-wise 1D IDFT - dump in already allocated Temp matrix memset( m_pTemp, 0, sizeof( double complex ) * N * N ); for ( int i = 0; i < N; ++i ) { m_pFilterOut = m_pFilterOutRef + i * N; for ( int j = 0; j < N; ++j, ++m_pTemp ) { m_pTwiddleIDFT = pTwiddleIDFT + j; m_pFilterOut = m_pFilterOutRef + i * N; for ( int k = 0; k < N; ++k, ++m_pFilterOut ) { *m_pTemp += *m_pFilterOut * *m_pTwiddleIDFT; m_pTwiddleIDFT += N; } } } std::cout << "Finished 1D IDFT .." << std::endl; // Reset pointer for next stage m_pTemp = m_pTempRef; m_pTwiddleIDFT = pTwiddleIDFT; // Next, apply column-wise 1D IDFT - dump in already allocated 2D_xfm matrix memset( m_p2D_xfm, 0, sizeof( double complex ) * N * N ); for ( int i = 0; i < N; ++i ) { m_pTwiddleIDFT = pTwiddleIDFT + i * N; for ( int j = 0; j < N; ++j, ++m_p2D_xfm ) { m_pTemp = m_pTempRef + j; m_pTwiddleIDFT = pTwiddleIDFT + i * N; for ( int k = 0; k < N; ++k, ++m_pTwiddleIDFT ) { *m_p2D_xfm += *m_pTemp * *m_pTwiddleIDFT; m_pTemp += N; } } } std::cout << "Finished 2D IDFT .." << std::endl; // Reset output pointer m_p2D_xfm = m_p2D_xfmRef; // Compute magnitude and convert output result to unsigned char for pgm write unsigned char *m_pOut = new unsigned char[N*N]; unsigned char *m_pOutRef = m_pOut; for ( int i = 0; i < N * N; ++i, ++m_p2D_xfm, ++m_pOut ) *m_pOut = (unsigned char) sqrt( pow( creal(*m_p2D_xfm), 2 ) + pow( cimag(*m_p2D_xfm), 2 ) ); // Generate end time stamp and report performance timer = clock() - timer; std::cout << "Total CPU execution time = " << static_cast<float>(timer) / CLOCKS_PER_SEC << " seconds" << std::endl; // Write output to file if( !sdkSavePGM( fnameOut, m_pOutRef, N, N ) ) std::cout << "Error Saving CPU output file!!" << std::endl; else std::cout << "Finished writing to CPU output!" << std::endl; // Cleanup local memory allocations delete[] m_pTempRef; delete[] m_p2D_xfmRef; delete[] m_pFilterOutRef; }
int main(int argc, char *argv[]) { char *fnameOut = "data/lena_filtered_gpu_lp.pgm"; // Output image unsigned char *h_pImgResult = NULL; // Output handle Complex *h_pTwiddleDFT = NULL; // Host DFT Twiddle Matrix Complex *h_pTwiddleIDFT = NULL; // Host IDFT Twiddle Matrix double complex *h_pTwiddleDFT_z = NULL; // Host CPU DFT Twiddle Matrix double complex *h_pTwiddleIDFT_z = NULL; // Host CPU IDFT Twiddle Matrix float *h_pFilterCoeffs = NULL; // Host Filter Coefficent Matrix static const int dev = 0; // Hard code to use device 0 cudaEvent_t start, stop; // Cuda events for benchmarking float time; // Performance timer for benchmarking std::string filterOption ("lowpass"); // Cmd line filter option // Load the image loadDefaultImage( argv[0] ); // Capture image reference unsigned char *m_imgRef = h_pImage; std::cout << "Image width = " << imWidth << " and image height = " << imHeight << std::endl; // Define number of threads and blocks to be mapped to the GPU dim3 nThreads = dim3( blkSize_x, blkSize_y, 1 ); dim3 nBlocks = dim3( ceil( imWidth / nThreads.x ), ceil( imHeight / nThreads.y ) ); int nPixels = imWidth * imHeight; unsigned int imSz = sizeof( unsigned char ) * nPixels; // Allocate host memory for the result h_pImgResult = static_cast<unsigned char*>( malloc( imSz ) ); unsigned char *m_imgOut = h_pImgResult; // Generate the DFT and IDFT twiddle matrix to be dumped to the GPU // and for CPU reference implementation unsigned int twiddleSz = sizeof( Complex ) * nPixels; unsigned int twiddleSz_z = sizeof( double complex ) * nPixels; h_pTwiddleDFT = static_cast<Complex*>( malloc( twiddleSz ) ); h_pTwiddleDFT_z = static_cast<double complex*>( malloc( twiddleSz_z ) ); twiddleMatrixGen( false, h_pTwiddleDFT, h_pTwiddleDFT_z ); h_pTwiddleIDFT = static_cast<Complex*>( malloc ( twiddleSz ) ); h_pTwiddleIDFT_z = static_cast<double complex*>( malloc( twiddleSz_z ) ); twiddleMatrixGen( true, h_pTwiddleIDFT, h_pTwiddleIDFT_z ); // Allocate and generate the Filter Coefficient Matrix unsigned int filterSz = sizeof( float ) * nPixels; h_pFilterCoeffs = static_cast<float*>( malloc( filterSz ) ); if ( filterOption == "lowpass" ) filterCoeffGen( true, W_C, h_pFilterCoeffs ); else filterCoeffGen( false, W_C, h_pFilterCoeffs ); // Initialize the result buffer for ( int i = 0; i < nPixels; ++i ) h_pImgResult[i] = 0; // Allocate device memory unsigned char *d_pImage, *d_pImgResult; Complex *d_pTwiddleDFT, *d_pTwiddleIDFT, *d_pTempMatrix, *d_p2D_xfm, *d_pFilterOut; float *d_pFilterCoeffs; checkCudaErrors( cudaMalloc( (void**) &d_pImage, imSz ) ); checkCudaErrors( cudaMalloc( (void**) &d_pImgResult, imSz ) ); checkCudaErrors( cudaMalloc( (void**) &d_pTwiddleDFT, twiddleSz ) ); checkCudaErrors( cudaMalloc( (void**) &d_pTwiddleIDFT, twiddleSz ) ); checkCudaErrors( cudaMalloc( (void**) &d_pTempMatrix, twiddleSz ) ); checkCudaErrors( cudaMalloc( (void**) &d_p2D_xfm, twiddleSz ) ); checkCudaErrors( cudaMalloc( (void**) &d_pFilterCoeffs, filterSz ) ); checkCudaErrors( cudaMalloc( (void**) &d_pFilterOut, filterSz ) ); // Copy host memory to the device - use h_pImgResult for zeroing device result buffer checkCudaErrors( cudaMemcpy( d_pImage, h_pImage, imSz, cudaMemcpyHostToDevice ) ); checkCudaErrors( cudaMemcpy( d_pImgResult, h_pImgResult, imSz, cudaMemcpyHostToDevice ) ); checkCudaErrors( cudaMemcpy( d_pTwiddleDFT, h_pTwiddleDFT, twiddleSz, cudaMemcpyHostToDevice ) ); checkCudaErrors( cudaMemcpy( d_pTwiddleIDFT, h_pTwiddleIDFT, twiddleSz, cudaMemcpyHostToDevice ) ); checkCudaErrors( cudaMemcpy( d_pFilterCoeffs, h_pFilterCoeffs, filterSz, cudaMemcpyHostToDevice ) ); // Create CUDA events for the timer checkCudaErrors ( cudaEventCreate( &start ) ); checkCudaErrors ( cudaEventCreate( &stop ) ); // Kickoff start timer and dispatch the row-wise DFT kernel, block until the kernel returns checkCudaErrors( cudaEventRecord( start, NULL ) ); dispatchDFTkernel( d_pImage, d_pTwiddleDFT, d_pTempMatrix, d_p2D_xfm, false, true, d_pImgResult, nBlocks, nThreads ); cudaDeviceSynchronize(); // Dispatch the column-wise DFT kernel, block until the kernel returns dispatchDFTkernel( d_pImage, d_pTwiddleDFT, d_pTempMatrix, d_p2D_xfm, false, false, d_pImgResult, nBlocks, nThreads ); cudaDeviceSynchronize(); // Dispatch the filter kernel, block until the kernel returns dispatchFilterKernel( d_p2D_xfm, d_pFilterCoeffs, d_pTempMatrix, nBlocks, nThreads ); cudaDeviceSynchronize(); // Dispatch the row-wise IDFT kernel, block until the kernel returns dispatchDFTkernel( d_pImage, d_pTwiddleIDFT, d_pTempMatrix, d_p2D_xfm, true, true, d_pImgResult, nBlocks, nThreads ); cudaDeviceSynchronize(); // Dispatch the column-wise IDFT kernel, block until the kernel returns dispatchDFTkernel( d_pImage, d_pTwiddleIDFT, d_pTempMatrix, d_p2D_xfm, true, false, d_pImgResult, nBlocks, nThreads ); cudaDeviceSynchronize(); // Test checkCudaErrors( cudaMemcpy( h_pImgResult, d_pImgResult, imSz, cudaMemcpyDeviceToHost ) ); // Generate stop event and record execution time checkCudaErrors( cudaEventRecord( stop, NULL ) ); checkCudaErrors( cudaEventSynchronize( stop ) ); checkCudaErrors( cudaEventElapsedTime( &time, start, stop ) ); checkCudaErrors( cudaEventDestroy( start ) ); checkCudaErrors( cudaEventDestroy( stop ) ); // Print out time results std::cout << "Total GPU Execution time = " << time/1000 << " seconds" << std::endl; // Save our result if( !sdkSavePGM( fnameOut, m_imgOut, imWidth, imHeight ) ) std::cout << "Error Saving output file!!" << std::endl; else std::cout << "Finished writing to output!" << std::endl; cudaDeviceReset(); // Calculate CPU reference output and benchmark calcGoldRef( h_pImage, h_pTwiddleDFT_z, h_pTwiddleIDFT_z, h_pFilterCoeffs ); // Host cleanup - CUDA device reset reclaims device memory allocations if ( h_pImgResult != NULL ) free ( h_pImgResult ); if ( h_pTwiddleDFT != NULL ) free ( h_pTwiddleDFT ); if ( h_pTwiddleDFT_z != NULL ) free ( h_pTwiddleDFT_z ); if ( h_pTwiddleIDFT != NULL ) free ( h_pTwiddleIDFT ); if ( h_pTwiddleIDFT_z != NULL ) free ( h_pTwiddleIDFT_z ); if ( h_pFilterCoeffs != NULL ) free ( h_pFilterCoeffs ); exit(EXIT_SUCCESS); }
void runAutoTest(int argc, char *argv[]) { printf("[%s] (automated testing w/ readback)\n", sSDKsample); int devID = findCudaDevice(argc, (const char **)argv); loadDefaultImage(argv[0]); Pixel *d_result; checkCudaErrors(cudaMalloc((void **)&d_result, imWidth*imHeight*sizeof(Pixel))); char *ref_file = NULL; char dump_file[256]; int mode = 0; mode = getCmdLineArgumentInt(argc, (const char **)argv, "mode"); getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); switch (mode) { case 0: g_SobelDisplayMode = SOBELDISPLAY_IMAGE; sprintf(dump_file, "lena_orig.pgm"); break; case 1: g_SobelDisplayMode = SOBELDISPLAY_SOBELTEX; sprintf(dump_file, "lena_tex.pgm"); break; case 2: g_SobelDisplayMode = SOBELDISPLAY_SOBELSHARED; sprintf(dump_file, "lena_shared.pgm"); break; default: printf("Invalid Filter Mode File\n"); exit(EXIT_FAILURE); break; } printf("AutoTest: %s <%s>\n", sSDKsample, filterMode[g_SobelDisplayMode]); sobelFilter(d_result, imWidth, imHeight, g_SobelDisplayMode, imageScale); checkCudaErrors(cudaDeviceSynchronize()); unsigned char *h_result = (unsigned char *)malloc(imWidth*imHeight*sizeof(Pixel)); checkCudaErrors(cudaMemcpy(h_result, d_result, imWidth*imHeight*sizeof(Pixel), cudaMemcpyDeviceToHost)); sdkSavePGM(dump_file, h_result, imWidth, imHeight); if (!sdkComparePGM(dump_file, sdkFindFilePath(ref_file, argv[0]), MAX_EPSILON_ERROR, 0.15f, false)) { g_TotalErrors++; } checkCudaErrors(cudaFree(d_result)); free(h_result); if (g_TotalErrors != 0) { printf("Test failed!\n"); exit(EXIT_FAILURE); } printf("Test passed!\n"); exit(EXIT_SUCCESS); }