int Lucas::fft_setup (int length) { clAmdFftSetupData fftSetupData; OPENCL_V_THROW(clAmdFftInitSetupData (&fftSetupData),"Failed to clAmdFftInitSetupData."); fftSetupData.debugFlags = CLFFT_DUMP_PROGRAMS; // Dumps the FFT kernels // Setup the AMD FFT library. OPENCL_V_THROW(clAmdFftSetup (&fftSetupData),"Failed to clAmdFftSetup."); // Create FFT Plan const size_t logicalDimensions[1] = { length }; // Create default plan. OPENCL_V_THROW(clAmdFftCreateDefaultPlan(&plan, context(), CLFFT_1D, logicalDimensions),"Failed to clAmdFftCreateDefaultPlan."); // Set double precision. OPENCL_V_THROW(clAmdFftSetPlanPrecision(plan, CLFFT_DOUBLE),"Failed to clAmdFftSetPlanPrecision."); // Set layout. OPENCL_V_THROW(clAmdFftSetLayout(plan, CLFFT_COMPLEX_INTERLEAVED,CLFFT_COMPLEX_INTERLEAVED),"Failed to clAmdFftSetLayout."); // Normalize forward transformation. OPENCL_V_THROW(clAmdFftSetPlanScale(plan, CLFFT_FORWARD, 1.0f/static_cast<cl_float>(length)),"Failed to clAmdFftSetPlanScale."); // Normalize backward transformation. OPENCL_V_THROW(clAmdFftSetPlanScale(plan, CLFFT_BACKWARD, 1.0f),"Failed to clAmdFftSetPlanScale."); // In-place FFT. OPENCL_V_THROW(clAmdFftSetResultLocation(plan, CLFFT_INPLACE),"Failed to clAmdFftSetResultLocation."); // Set number of transformations per plan. OPENCL_V_THROW(clAmdFftSetPlanBatchSize(plan, 1),"Failed to clAmdFftSetPlanBatchSize."); // BakePlan OPENCL_V_THROW(clAmdFftBakePlan(plan, 1, &commandQueue(), NULL, NULL),"Failed to clAmdFftBakePlan."); }
virtual ~clblasFunc() { clblasTeardown(); OPENCL_V_THROW( clReleaseCommandQueue(queue_), "releasing command queue" ); OPENCL_V_THROW( clReleaseContext(ctx_), "releasing context" ); }
clblasFunc(StatisticalTimer& _timer, cl_device_type devType) : timer(_timer) { cl_int err; /* Setup OpenCL environment. */ OPENCL_V_THROW(clGetPlatformIDs(1, &platform_, NULL), "getting platform IDs"); OPENCL_V_THROW(clGetDeviceIDs(platform_, devType, 1, &device_, NULL), "getting device IDs"); props_[0] = CL_CONTEXT_PLATFORM; props_[1] = (cl_context_properties)platform_; props_[2] = 0; ctx_ = clCreateContext(props_, 1, &device_, NULL, NULL, &err); OPENCL_V_THROW(err, "creating context"); queue_ = clCreateCommandQueue(ctx_, device_, 0, &err); timer_id = timer.getUniqueID( "clfunc", 0 ); maxMemAllocSize = queryMemAllocSize( device_ ); /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { std::cerr << "clblasSetup() failed with %d\n"; clReleaseCommandQueue(queue_); clReleaseContext(ctx_); } }
~xGeru() { delete buffer.cpuA; delete buffer.cpuX; OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A"); OPENCL_V_THROW( clReleaseMemObject(buffer.X), "releasing buffer C"); }
~xTrmv() { delete buffer_.a_; delete buffer_.x_; OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_), "releasing buffer A"); OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_x_), "releasing buffer X"); OPENCL_V_THROW( clReleaseMemObject(buffer_.scratch_), "releasing buffer X"); }
~xSyrk() { delete buffer_.a_; delete buffer_.c_; OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_), "releasing buffer A"); OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_), "releasing buffer C"); }
void releaseGPUBuffer_deleteCPUBuffer() { //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp) //need to do this before we eventually hit the destructor delete buffer_.a_; delete buffer_.c_; OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_), "releasing buffer A"); OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_), "releasing buffer C"); }
~xSymv() { delete buffer_.a_; delete buffer_.x_; delete buffer_.y_; OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_), "releasing buffer A"); OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_x_), "releasing buffer X"); OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_y_), "releasing buffer Y"); }
size_t max_mem_available_on_cl_device(size_t device_index) { std::vector< cl_device_id > device_id; cl_context tempContext = NULL; device_id = initializeCL( g_device_type, (cl_int)device_index, g_platform_id, tempContext, false ); cl_ulong device_max_to_allocate = 0; if( device_id.size() == 0 || device_index > device_id.size() ) { } else { OPENCL_V_THROW( ::clGetDeviceInfo( device_id[device_index], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof( cl_ulong ), &device_max_to_allocate, NULL ), "Getting CL_DEVICE_MAX_MEM_ALLOC_SIZE device info ( ::clGetDeviceInfo() )" ); } cl_command_queue tempQueue = NULL; cl_event tempEvent = NULL; ::cleanupCL( &tempContext, &tempQueue, 0, NULL, 0, NULL, &tempEvent ); return static_cast<size_t>(device_max_to_allocate); }
size_t max_mem_available_on_cl_device(size_t device_index) { static size_t g_device_max_mem_size = 0; // this is not thread-safe using globals, it is just quick fix for now, todo proper fix if (g_device_max_mem_size == 0) { std::vector< cl_device_id > device_id; cl_context tempContext = NULL; device_id = initializeCL( g_device_type, (cl_int)device_index, g_platform_id, tempContext, false ); cl_ulong device_max_to_allocate = 0; if (device_id.size() == 0 || device_index > device_id.size()) { } else { OPENCL_V_THROW(::clGetDeviceInfo(device_id[device_index], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &device_max_to_allocate, NULL), "Getting CL_DEVICE_MAX_MEM_ALLOC_SIZE device info ( ::clGetDeviceInfo() )"); } cl_command_queue tempQueue = NULL; cl_event tempEvent = NULL; ::cleanupCL(&tempContext, &tempQueue, 0, NULL, 0, NULL, &tempEvent); g_device_max_mem_size = static_cast<size_t>(device_max_to_allocate); } return g_device_max_mem_size; }
int Lucas::fft_close () { OPENCL_V_THROW(clAmdFftDestroyPlan (&plan),"Failed to clAmdFftDestroyPlan."); OPENCL_V_THROW(clAmdFftTeardown (),"Failed to clAmdFftTeardown."); }
int _tmain( int argc, _TCHAR* argv[] ) { // This helps with mixing output of both wide and narrow characters to the screen std::ios::sync_with_stdio( false ); // Define MEMORYREPORT on windows platfroms to enable debug memory heap checking #if defined( MEMORYREPORT ) && defined( _WIN32 ) TCHAR logPath[ MAX_PATH ]; ::GetCurrentDirectory( MAX_PATH, logPath ); ::_tcscat_s( logPath, _T( "\\MemoryReport.txt") ); // We leak the handle to this file, on purpose, so that the ::_CrtSetReportFile() can output it's memory // statistics on app shutdown HANDLE hLogFile; hLogFile = ::CreateFile( logPath, GENERIC_WRITE, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL ); ::_CrtSetReportMode( _CRT_ASSERT, _CRTDBG_MODE_FILE | _CRTDBG_MODE_WNDW | _CRTDBG_MODE_DEBUG ); ::_CrtSetReportMode( _CRT_ERROR, _CRTDBG_MODE_FILE | _CRTDBG_MODE_WNDW | _CRTDBG_MODE_DEBUG ); ::_CrtSetReportMode( _CRT_WARN, _CRTDBG_MODE_FILE | _CRTDBG_MODE_DEBUG ); ::_CrtSetReportFile( _CRT_ASSERT, hLogFile ); ::_CrtSetReportFile( _CRT_ERROR, hLogFile ); ::_CrtSetReportFile( _CRT_WARN, hLogFile ); int tmp = ::_CrtSetDbgFlag( _CRTDBG_REPORT_FLAG ); tmp |= _CRTDBG_LEAK_CHECK_DF | _CRTDBG_ALLOC_MEM_DF | _CRTDBG_CHECK_ALWAYS_DF; ::_CrtSetDbgFlag( tmp ); // By looking at the memory leak report that is generated by this debug heap, there is a number with // {} brackets that indicates the incremental allocation number of that block. If you wish to set // a breakpoint on that allocation number, put it in the _CrtSetBreakAlloc() call below, and the heap // will issue a bp on the request, allowing you to look at the call stack // ::_CrtSetBreakAlloc( 1833 ); #endif /* MEMORYREPORT */ // OpenCL state cl_device_type deviceType = CL_DEVICE_TYPE_ALL; cl_int deviceId = 0; cl_int platformId = 0; // FFT state clfftResultLocation place = CLFFT_INPLACE; clfftLayout inLayout = CLFFT_COMPLEX_INTERLEAVED; clfftLayout outLayout = CLFFT_COMPLEX_INTERLEAVED; clfftPrecision precision = CLFFT_SINGLE; clfftDirection dir = CLFFT_FORWARD; size_t lengths[ 3 ] = {1,1,1}; size_t iStrides[ 4 ] = {0,0,0,0}; size_t oStrides[ 4 ] = {0,0,0,0}; cl_uint profile_count = 0; cl_uint command_queue_flags = 0; size_t batchSize = 1; // Initialize flags for FFT library std::auto_ptr< clfftSetupData > setupData( new clfftSetupData ); OPENCL_V_THROW( clfftInitSetupData( setupData.get( ) ), "clfftInitSetupData failed" ); try { // Declare the supported options. po::options_description desc( "clFFT client command line options" ); desc.add_options() ( "help,h", "produces this help message" ) ( "version,v", "Print queryable version information from the clFFT library" ) ( "clinfo,i", "Print queryable information of all the OpenCL runtimes and devices" ) ( "printChosen", "Print queryable information of the selected OpenCL runtime and device" ) ( "gpu,g", "Force selection of OpenCL GPU devices only" ) ( "cpu,c", "Force selection of OpenCL CPU devices only" ) ( "all,a", "Force selection of all OpenCL devices (default)" ) ( "platform", po::value< cl_int >( &platformId )->default_value( 0 ), "Select a specific OpenCL platform id as it is reported by clinfo" ) ( "device", po::value< cl_int >( &deviceId )->default_value( 0 ), "Select a specific OpenCL device id as it is reported by clinfo" ) ( "outPlace,o", "Out of place FFT transform (default: in place)" ) ( "double", "Double precision transform (default: single)" ) ( "inv", "Backward transform (default: forward)" ) ( "dumpKernels,d", "FFT engine will dump generated OpenCL FFT kernels to disk (default: dump off)" ) ( "lenX,x", po::value< size_t >( &lengths[ 0 ] )->default_value( 1024 ), "Specify the length of the 1st dimension of a test array" ) ( "lenY,y", po::value< size_t >( &lengths[ 1 ] )->default_value( 1 ), "Specify the length of the 2nd dimension of a test array" ) ( "lenZ,z", po::value< size_t >( &lengths[ 2 ] )->default_value( 1 ), "Specify the length of the 3rd dimension of a test array" ) ( "isX", po::value< size_t >( &iStrides[ 0 ] )->default_value( 1 ), "Specify the input stride of the 1st dimension of a test array" ) ( "isY", po::value< size_t >( &iStrides[ 1 ] )->default_value( 0 ), "Specify the input stride of the 2nd dimension of a test array" ) ( "isZ", po::value< size_t >( &iStrides[ 2 ] )->default_value( 0 ), "Specify the input stride of the 3rd dimension of a test array" ) ( "iD", po::value< size_t >( &iStrides[ 3 ] )->default_value( 0 ), "input distance between subsequent sets of data when batch size > 1" ) ( "osX", po::value< size_t >( &oStrides[ 0 ] )->default_value( 1 ), "Specify the output stride of the 1st dimension of a test array" ) ( "osY", po::value< size_t >( &oStrides[ 1 ] )->default_value( 0 ), "Specify the output stride of the 2nd dimension of a test array" ) ( "osZ", po::value< size_t >( &oStrides[ 2 ] )->default_value( 0 ), "Specify the output stride of the 3rd dimension of a test array" ) ( "oD", po::value< size_t >( &oStrides[ 3 ] )->default_value( 0 ), "output distance between subsequent sets of data when batch size > 1" ) ( "batchSize,b", po::value< size_t >( &batchSize )->default_value( 1 ), "If this value is greater than one, arrays will be used " ) ( "profile,p", po::value< cl_uint >( &profile_count )->default_value( 1 ), "Time and report the kernel speed of the FFT (default: profiling off)" ) ( "inLayout", po::value< clfftLayout >( &inLayout )->default_value( CLFFT_COMPLEX_INTERLEAVED ), "Layout of input data:\n1) interleaved\n2) planar\n3) hermitian interleaved\n4) hermitian planar\n5) real" ) ( "outLayout", po::value< clfftLayout >( &outLayout )->default_value( CLFFT_COMPLEX_INTERLEAVED ), "Layout of input data:\n1) interleaved\n2) planar\n3) hermitian interleaved\n4) hermitian planar\n5) real" ) ; po::variables_map vm; po::store( po::parse_command_line( argc, argv, desc ), vm ); po::notify( vm ); if( vm.count( "version" ) ) { const int indent = countOf( "clFFT client API version: " ); tout << std::left << std::setw( indent ) << _T( "clFFT client API version: " ) << clfftVersionMajor << _T( "." ) << clfftVersionMinor << _T( "." ) << clfftVersionPatch << std::endl; cl_uint libMajor, libMinor, libPatch; clfftGetVersion( &libMajor, &libMinor, &libPatch ); tout << std::left << std::setw( indent ) << _T( "clFFT runtime version: " ) << libMajor << _T( "." ) << libMinor << _T( "." ) << libPatch << std::endl << std::endl; } if( vm.count( "help" ) ) { // This needs to be 'cout' as program-options does not support wcout yet std::cout << desc << std::endl; return 0; } size_t mutex = ((vm.count( "gpu" ) > 0) ? 1 : 0) | ((vm.count( "cpu" ) > 0) ? 2 : 0) | ((vm.count( "all" ) > 0) ? 4 : 0); if ((mutex & (mutex-1)) != 0) { terr << _T("You have selected mutually-exclusive OpenCL device options:") << std::endl; if (vm.count ( "gpu" ) > 0) terr << _T(" gpu,g Force selection of OpenCL GPU devices only" ) << std::endl; if (vm.count ( "cpu" ) > 0) terr << _T(" cpu,c Force selection of OpenCL CPU devices only" ) << std::endl; if (vm.count ( "all" ) > 0) terr << _T(" all,a Force selection of all OpenCL devices (default)" ) << std::endl; return 1; } if( vm.count( "gpu" ) ) { deviceType = CL_DEVICE_TYPE_GPU; } if( vm.count( "cpu" ) ) { deviceType = CL_DEVICE_TYPE_CPU; } if( vm.count( "all" ) ) { deviceType = CL_DEVICE_TYPE_ALL; } if( vm.count( "clinfo" ) ) { std::vector< cl_platform_id > platformInfos; std::vector< std::vector< cl_device_id > > deviceInfos; discoverCLPlatforms( deviceType, platformInfos, deviceInfos ); prettyPrintCLPlatforms(platformInfos, deviceInfos); return 0; } bool printInfo = false; if( vm.count( "printChosen" ) ) { printInfo = true; } if( vm.count( "outPlace" ) ) { place = CLFFT_OUTOFPLACE; } if( vm.count( "double" ) ) { precision = CLFFT_DOUBLE; } if( vm.count( "inv" ) ) { dir = CLFFT_BACKWARD; } if( profile_count > 1 ) { command_queue_flags |= CL_QUEUE_PROFILING_ENABLE; } if( vm.count( "dumpKernels" ) ) { setupData->debugFlags |= CLFFT_DUMP_PROGRAMS; } int inL = (int)inLayout; int otL = (int)outLayout; // input output layout support matrix int ioLayoutSupport[5][5] = { { 1, 1, 0, 0, 1 }, { 1, 1, 0, 0, 1 }, { 0, 0, 0, 0, 1 }, { 0, 0, 0, 0, 1 }, { 1, 1, 1, 1, 0 }, }; if((inL < 1) || (inL > 5)) throw std::runtime_error( "Invalid Input layout format" ); if((otL < 1) || (otL > 5)) throw std::runtime_error( "Invalid Output layout format" ); if(ioLayoutSupport[inL-1][otL-1] == 0) throw std::runtime_error( "Invalid combination of Input/Output layout formats" ); if( ((inL == 1) || (inL == 2)) && ((otL == 1) || (otL == 2)) ) // Complex-Complex cases { iStrides[1] = iStrides[1] ? iStrides[1] : lengths[0] * iStrides[0]; iStrides[2] = iStrides[2] ? iStrides[2] : lengths[1] * iStrides[1]; iStrides[3] = iStrides[3] ? iStrides[3] : lengths[2] * iStrides[2]; if(place == CLFFT_INPLACE) { oStrides[0] = iStrides[0]; oStrides[1] = iStrides[1]; oStrides[2] = iStrides[2]; oStrides[3] = iStrides[3]; } else { oStrides[1] = oStrides[1] ? oStrides[1] : lengths[0] * oStrides[0]; oStrides[2] = oStrides[2] ? oStrides[2] : lengths[1] * oStrides[1]; oStrides[3] = oStrides[3] ? oStrides[3] : lengths[2] * oStrides[2]; } } else // Real-Complex and Complex-Real cases { size_t *rst, *cst; size_t N = lengths[0]; size_t Nt = 1 + lengths[0]/2; bool iflag = false; bool rcFull = (inL == 1) || (inL == 2) || (otL == 1) || (otL == 2); if(inLayout == CLFFT_REAL) { iflag = true; rst = iStrides; } else { rst = oStrides; } // either in or out should be REAL // Set either in or out strides whichever is real if(place == CLFFT_INPLACE) { if(rcFull) { rst[1] = rst[1] ? rst[1] : N * 2 * rst[0]; } else { rst[1] = rst[1] ? rst[1] : Nt * 2 * rst[0]; } rst[2] = rst[2] ? rst[2] : lengths[1] * rst[1]; rst[3] = rst[3] ? rst[3] : lengths[2] * rst[2]; } else { rst[1] = rst[1] ? rst[1] : lengths[0] * rst[0]; rst[2] = rst[2] ? rst[2] : lengths[1] * rst[1]; rst[3] = rst[3] ? rst[3] : lengths[2] * rst[2]; } // Set the remaining of in or out strides that is not real if(iflag) { cst = oStrides; } else { cst = iStrides; } if(rcFull) { cst[1] = cst[1] ? cst[1] : N * cst[0]; } else { cst[1] = cst[1] ? cst[1] : Nt * cst[0]; } cst[2] = cst[2] ? cst[2] : lengths[1] * cst[1]; cst[3] = cst[3] ? cst[3] : lengths[2] * cst[2]; } if( precision == CLFFT_SINGLE ) transform<float>( lengths, iStrides, oStrides, batchSize, inLayout, outLayout, place, precision, dir, deviceType, deviceId, platformId, printInfo, command_queue_flags, profile_count, setupData ); else transform<double>( lengths, iStrides, oStrides, batchSize, inLayout, outLayout, place, precision, dir, deviceType, deviceId, platformId, printInfo, command_queue_flags, profile_count, setupData ); } catch( std::exception& e ) { terr << _T( "clFFT error condition reported:" ) << std::endl << e.what() << std::endl; return 1; } return 0; }
int transform( size_t* lengths, const size_t *inStrides, const size_t *outStrides, size_t batch_size, clfftLayout in_layout, clfftLayout out_layout, clfftResultLocation place, clfftPrecision precision, clfftDirection dir, cl_device_type deviceType, cl_int deviceId, cl_int platformId, bool printInfo, cl_uint command_queue_flags, cl_uint profile_count, std::auto_ptr< clfftSetupData > setupData ) { // Our command line does not specify what dimension FFT we wish to transform; we decode // this from the lengths that the user specifies for X, Y, Z. A length of one means that // The user does not want that dimension. const size_t max_dimensions = 3; size_t strides[ 4 ]; size_t o_strides[ 4 ]; size_t fftVectorSize = 0; size_t fftVectorSizePadded = 0; size_t fftBatchSize = 0; size_t outfftVectorSize = 0; size_t outfftVectorSizePadded = 0; size_t outfftBatchSize = 0; size_t size_of_input_buffers_in_bytes = 0; size_t size_of_output_buffers_in_bytes = 0; cl_uint number_of_output_buffers = 0; clfftDim dim = CLFFT_1D; cl_mem input_cl_mem_buffers [2] = { NULL, NULL }; cl_mem output_cl_mem_buffers[2] = { NULL, NULL }; std::vector< cl_device_id > device_id; cl_context context; cl_command_queue queue; cl_event outEvent = NULL; clfftPlanHandle plan_handle; for (unsigned u = 0; u < max_dimensions; ++u) { if (0 != lengths[u]) continue; lengths[u] = 1; } if( lengths[ 1 ] > 1 ) { dim = CLFFT_2D; } if( lengths[ 2 ] > 1 ) { dim = CLFFT_3D; } strides[ 0 ] = inStrides[0]; strides[ 1 ] = inStrides[1]; strides[ 2 ] = inStrides[2]; strides[ 3 ] = inStrides[3]; o_strides[ 0 ] = outStrides[0]; o_strides[ 1 ] = outStrides[1]; o_strides[ 2 ] = outStrides[2]; o_strides[ 3 ] = outStrides[3]; fftVectorSize = lengths[0] * lengths[1] * lengths[2]; fftVectorSizePadded = strides[3]; fftBatchSize = fftVectorSizePadded * batch_size; size_t Nt = 1 + lengths[0]/2; if(place == CLFFT_INPLACE) { outfftVectorSize = fftVectorSize; outfftVectorSizePadded = fftVectorSizePadded; outfftBatchSize = fftBatchSize; } else { outfftVectorSize = lengths[0] * lengths[1] * lengths[2]; outfftVectorSizePadded = o_strides[3]; outfftBatchSize = outfftVectorSizePadded * batch_size; } // Real to complex case if( (in_layout == CLFFT_REAL) || (out_layout == CLFFT_REAL) ) { fftVectorSizePadded = strides[3]; fftBatchSize = fftVectorSizePadded * batch_size; outfftVectorSizePadded = o_strides[3]; outfftBatchSize = outfftVectorSizePadded * batch_size; fftVectorSize = lengths[0] * lengths[1] * lengths[2]; outfftVectorSize = fftVectorSize; } switch( out_layout ) { case CLFFT_COMPLEX_INTERLEAVED: number_of_output_buffers = 1; size_of_output_buffers_in_bytes = outfftBatchSize * sizeof( std::complex< T > ); break; case CLFFT_COMPLEX_PLANAR: number_of_output_buffers = 2; size_of_output_buffers_in_bytes = outfftBatchSize * sizeof(T); break; case CLFFT_HERMITIAN_INTERLEAVED: number_of_output_buffers = 1; size_of_output_buffers_in_bytes = outfftBatchSize * sizeof( std::complex< T > ); break; case CLFFT_HERMITIAN_PLANAR: number_of_output_buffers = 2; size_of_output_buffers_in_bytes = outfftBatchSize * sizeof(T); break; case CLFFT_REAL: number_of_output_buffers = 1; size_of_output_buffers_in_bytes = outfftBatchSize * sizeof(T); break; } // Fill the input buffers switch( in_layout ) { case CLFFT_COMPLEX_INTERLEAVED: { // This call creates our openCL context and sets up our devices; expected to throw on error size_of_input_buffers_in_bytes = fftBatchSize * sizeof( std::complex< T > ); device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo ); createOpenCLCommandQueue( context, command_queue_flags, queue, device_id, size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers, size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers); std::vector< std::complex< T > > input( fftBatchSize ); // set zero for( cl_uint i = 0; i < fftBatchSize; ++i ) { input[ i ] = 0; } // impulse test case for(size_t b = 0; b < batch_size; b++) { size_t p3 = b * strides[3]; for(size_t k = 0; k < lengths[2]; k++) { size_t p2 = p3 + k * strides[2]; for(size_t j = 0; j < lengths[1]; j++) { size_t p1 = p2 + j * strides[1]; for(size_t i = 0; i < lengths[0]; i++) { size_t p0 = p1 + i * strides[0]; input[p0] = 1; } } } } OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &input[ 0 ], 0, NULL, &outEvent ), "clEnqueueWriteBuffer failed" ); } break; case CLFFT_COMPLEX_PLANAR: { // This call creates our openCL context and sets up our devices; expected to throw on error size_of_input_buffers_in_bytes = fftBatchSize * sizeof( T ); device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo ); createOpenCLCommandQueue( context, command_queue_flags, queue, device_id, size_of_input_buffers_in_bytes, 2, input_cl_mem_buffers, size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers); std::vector< T > real( fftBatchSize ); std::vector< T > imag( fftBatchSize ); // set zero for( cl_uint i = 0; i < fftBatchSize; ++i ) { real[ i ] = 0; imag[ i ] = 0; } // impulse test case for(size_t b = 0; b < batch_size; b++) { size_t p3 = b * strides[3]; for(size_t k = 0; k < lengths[2]; k++) { size_t p2 = p3 + k * strides[2]; for(size_t j = 0; j < lengths[1]; j++) { size_t p1 = p2 + j * strides[1]; for(size_t i = 0; i < lengths[0]; i++) { size_t p0 = p1 + i * strides[0]; real[p0] = 1; } } } } OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ], 0, NULL, &outEvent ), "clEnqueueWriteBuffer failed" ); OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ], 0, NULL, &outEvent ), "clEnqueueWriteBuffer failed" ); } break; case CLFFT_HERMITIAN_INTERLEAVED: { // This call creates our openCL context and sets up our devices; expected to throw on error size_of_input_buffers_in_bytes = fftBatchSize * sizeof( std::complex< T > ); device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo ); createOpenCLCommandQueue( context, command_queue_flags, queue, device_id, size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers, size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers); std::vector< std::complex< T > > input( fftBatchSize ); // set zero for( cl_uint i = 0; i < fftBatchSize; ++i ) { input[ i ] = 0; } // impulse test case for(size_t b = 0; b < batch_size; b++) { size_t p3 = b * strides[3]; input[p3] = static_cast<T>(outfftVectorSize); } OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &input[ 0 ], 0, NULL, &outEvent ), "clEnqueueWriteBuffer failed" ); } break; case CLFFT_HERMITIAN_PLANAR: { // This call creates our openCL context and sets up our devices; expected to throw on error size_of_input_buffers_in_bytes = fftBatchSize * sizeof( T ); device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo ); createOpenCLCommandQueue( context, command_queue_flags, queue, device_id, size_of_input_buffers_in_bytes, 2, input_cl_mem_buffers, size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers); std::vector< T > real( fftBatchSize ); std::vector< T > imag( fftBatchSize ); // set zero for( cl_uint i = 0; i < fftBatchSize; ++i ) { real[ i ] = 0; imag[ i ] = 0; } // impulse test case for(size_t b = 0; b < batch_size; b++) { size_t p3 = b * strides[3]; real[p3] = static_cast<T>(outfftVectorSize); } OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ], 0, NULL, &outEvent ), "clEnqueueWriteBuffer failed" ); OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ], 0, NULL, &outEvent ), "clEnqueueWriteBuffer failed" ); } break; case CLFFT_REAL: { // This call creates our openCL context and sets up our devices; expected to throw on error size_of_input_buffers_in_bytes = fftBatchSize * sizeof( T ); device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo ); createOpenCLCommandQueue( context, command_queue_flags, queue, device_id, size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers, size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers); std::vector< T > real( fftBatchSize ); // set zero for( cl_uint i = 0; i < fftBatchSize; ++i ) { real[ i ] = 0; } // impulse test case for(size_t b = 0; b < batch_size; b++) { size_t p3 = b * strides[3]; for(size_t k = 0; k < lengths[2]; k++) { size_t p2 = p3 + k * strides[2]; for(size_t j = 0; j < lengths[1]; j++) { size_t p1 = p2 + j * strides[1]; for(size_t i = 0; i < lengths[0]; i++) { size_t p0 = p1 + i * strides[0]; real[p0] = 1; } } } } OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ], 0, NULL, &outEvent ), "clEnqueueWriteBuffer failed" ); } break; default: { throw std::runtime_error( "Input layout format not yet supported" ); } break; } // Discover and load the timer module if present void* timerLibHandle = LoadSharedLibrary( "lib", "StatTimer", false ); if( timerLibHandle == NULL ) { terr << _T( "Could not find the external timing library; timings disabled" ) << std::endl; } // Timer module discovered and loaded successfully // Initialize function pointers to call into the shared module PFGETSTATTIMER get_timer = reinterpret_cast< PFGETSTATTIMER > ( LoadFunctionAddr( timerLibHandle, "getStatTimer" ) ); // Create and initialize our timer class, if the external timer shared library loaded baseStatTimer* timer = NULL; size_t clFFTID = 0; if( get_timer ) { timer = get_timer( CLFFT_GPU ); timer->Reserve( 1, profile_count ); timer->setNormalize( true ); clFFTID = timer->getUniqueID( "clFFT", 0 ); } OPENCL_V_THROW( clfftSetup( setupData.get( ) ), "clfftSetup failed" ); OPENCL_V_THROW( clfftCreateDefaultPlan( &plan_handle, context, dim, lengths ), "clfftCreateDefaultPlan failed" ); // Default plan creates a plan that expects an inPlace transform with interleaved complex numbers OPENCL_V_THROW( clfftSetResultLocation( plan_handle, place ), "clfftSetResultLocation failed" ); OPENCL_V_THROW( clfftSetLayout( plan_handle, in_layout, out_layout ), "clfftSetLayout failed" ); OPENCL_V_THROW( clfftSetPlanBatchSize( plan_handle, batch_size ), "clfftSetPlanBatchSize failed" ); OPENCL_V_THROW( clfftSetPlanPrecision( plan_handle, precision ), "clfftSetPlanPrecision failed" ); OPENCL_V_THROW (clfftSetPlanInStride ( plan_handle, dim, strides ), "clfftSetPlanInStride failed" ); OPENCL_V_THROW (clfftSetPlanOutStride ( plan_handle, dim, o_strides ), "clfftSetPlanOutStride failed" ); OPENCL_V_THROW (clfftSetPlanDistance ( plan_handle, strides[ 3 ], o_strides[ 3 ]), "clfftSetPlanDistance failed" ); // Set backward scale factor to 1.0 for non real FFTs to do correct output checks if(dir == CLFFT_BACKWARD && in_layout != CLFFT_REAL && out_layout != CLFFT_REAL) OPENCL_V_THROW (clfftSetPlanScale( plan_handle, CLFFT_BACKWARD, (cl_float)1.0f ), "clfftSetPlanScale failed" ); OPENCL_V_THROW( clfftBakePlan( plan_handle, 1, &queue, NULL, NULL ), "clfftBakePlan failed" ); //get the buffersize size_t buffersize=0; OPENCL_V_THROW( clfftGetTmpBufSize(plan_handle, &buffersize ), "clfftGetTmpBufSize failed" ); //allocate the intermediate buffer cl_mem clMedBuffer=NULL; if (buffersize) { cl_int medstatus; clMedBuffer = clCreateBuffer ( context, CL_MEM_READ_WRITE, buffersize, 0, &medstatus); OPENCL_V_THROW( medstatus, "Creating intmediate Buffer failed" ); } switch( in_layout ) { case CLFFT_COMPLEX_INTERLEAVED: case CLFFT_COMPLEX_PLANAR: case CLFFT_HERMITIAN_INTERLEAVED: case CLFFT_HERMITIAN_PLANAR: case CLFFT_REAL: break; default: // Don't recognize input layout return CLFFT_INVALID_ARG_VALUE; } switch( out_layout ) { case CLFFT_COMPLEX_INTERLEAVED: case CLFFT_COMPLEX_PLANAR: case CLFFT_HERMITIAN_INTERLEAVED: case CLFFT_HERMITIAN_PLANAR: case CLFFT_REAL: break; default: // Don't recognize output layout return CLFFT_INVALID_ARG_VALUE; } if (( place == CLFFT_INPLACE ) && ( in_layout != out_layout )) { switch( in_layout ) { case CLFFT_COMPLEX_INTERLEAVED: { if( (out_layout == CLFFT_COMPLEX_PLANAR) || (out_layout == CLFFT_HERMITIAN_PLANAR) ) { throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" ); } break; } case CLFFT_COMPLEX_PLANAR: { if( (out_layout == CLFFT_COMPLEX_INTERLEAVED) || (out_layout == CLFFT_HERMITIAN_INTERLEAVED) ) { throw std::runtime_error( "Cannot use the same buffer for planar->interleaved in-place transforms" ); } break; } case CLFFT_HERMITIAN_INTERLEAVED: { if( out_layout != CLFFT_REAL ) { throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" ); } break; } case CLFFT_HERMITIAN_PLANAR: { throw std::runtime_error( "Cannot use the same buffer for planar->interleaved in-place transforms" ); break; } case CLFFT_REAL: { if( (out_layout == CLFFT_COMPLEX_PLANAR) || (out_layout == CLFFT_HERMITIAN_PLANAR) ) { throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" ); } break; } } } // Loop as many times as the user specifies to average out the timings // cl_mem * BuffersOut = ( place == CLFFT_INPLACE ) ? NULL : &output_cl_mem_buffers[ 0 ]; Timer tr; tr.Start(); for( cl_uint i = 0; i < profile_count; ++i ) { if( timer ) timer->Start( clFFTID ); OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &queue, 0, NULL, &outEvent, &input_cl_mem_buffers[ 0 ], BuffersOut, clMedBuffer ), "clfftEnqueueTransform failed" ); if( timer ) timer->Stop( clFFTID ); } OPENCL_V_THROW( clFinish( queue ), "clFinish failed" ); if(clMedBuffer) clReleaseMemObject(clMedBuffer); double wtime = tr.Sample()/((double)profile_count); size_t totalLen = 1; for(int i=0; i<dim; i++) totalLen *= lengths[i]; double opsconst = 5.0 * (double)totalLen * log((double)totalLen) / log(2.0); if(profile_count > 1) { tout << "\nExecution wall time: " << 1000.0*wtime << " ms" << std::endl; tout << "Execution gflops: " << ((double)batch_size * opsconst)/(1000000000.0*wtime) << std::endl; } if( timer && (command_queue_flags & CL_QUEUE_PROFILING_ENABLE) ) { // Remove all timings that are outside of 2 stddev (keep 65% of samples); we ignore outliers to get a more consistent result timer->pruneOutliers( 2.0 ); timer->Print( ); timer->Reset( ); } /*****************/ FreeSharedLibrary( timerLibHandle ); // Read and check output data // This check is not valid if the FFT is executed multiple times inplace. // if (( place == CLFFT_OUTOFPLACE ) || ( profile_count == 1)) { bool checkflag= false; switch( out_layout ) { case CLFFT_HERMITIAN_INTERLEAVED: case CLFFT_COMPLEX_INTERLEAVED: { std::vector< std::complex< T > > output( outfftBatchSize ); if( place == CLFFT_INPLACE ) { OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &output[ 0 ], 0, NULL, NULL ), "Reading the result buffer failed" ); } else { OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &output[ 0 ], 0, NULL, NULL ), "Reading the result buffer failed" ); } //check output data for( cl_uint i = 0; i < outfftBatchSize; ++i ) { if (0 == (i % outfftVectorSizePadded)) { if (output[i].real() != outfftVectorSize) { checkflag = true; break; } } else { if (output[ i ].real() != 0) { checkflag = true; break; } } if (output[ i ].imag() != 0) { checkflag = true; break; } } } break; case CLFFT_HERMITIAN_PLANAR: case CLFFT_COMPLEX_PLANAR: { std::valarray< T > real( outfftBatchSize ); std::valarray< T > imag( outfftBatchSize ); if( place == CLFFT_INPLACE ) { OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ], 0, NULL, NULL ), "Reading the result buffer failed" ); OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ], 0, NULL, NULL ), "Reading the result buffer failed" ); } else { OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &real[ 0 ], 0, NULL, NULL ), "Reading the result buffer failed" ); OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 1 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &imag[ 0 ], 0, NULL, NULL ), "Reading the result buffer failed" ); } // Check output data for( cl_uint i = 0; i < outfftBatchSize; ++i ) { if (0 == (i % outfftVectorSizePadded)) { if (real[i] != outfftVectorSize) { checkflag = true; break; } } else { if (real[i] != 0) { checkflag = true; break; } } if (imag[i] != 0) { checkflag = true; break; } } } break; case CLFFT_REAL: { std::valarray< T > real( outfftBatchSize ); if( place == CLFFT_INPLACE ) { OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ], 0, NULL, NULL ), "Reading the result buffer failed" ); } else { OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &real[ 0 ], 0, NULL, NULL ), "Reading the result buffer failed" ); } ////check output data for(size_t b = 0; b < batch_size; b++) { size_t p3 = b * o_strides[3]; for(size_t k = 0; k < lengths[2]; k++) { size_t p2 = p3 + k * o_strides[2]; for(size_t j = 0; j < lengths[1]; j++) { size_t p1 = p2 + j * o_strides[1]; for(size_t i = 0; i < lengths[0]; i++) { size_t p0 = p1 + i * o_strides[0]; if (real[p0] != 1) { checkflag = true; break; } } } } } } break; default: { throw std::runtime_error( "Input layout format not yet supported" ); } break; } if (checkflag) { std::cout << "\n\n\t\tInternal Client Test *****FAIL*****" << std::endl; } else { std::cout << "\n\n\t\tInternal Client Test *****PASS*****" << std::endl; } } OPENCL_V_THROW( clfftDestroyPlan( &plan_handle ), "clfftDestroyPlan failed" ); OPENCL_V_THROW( clfftTeardown( ), "clfftTeardown failed" ); cleanupCL( &context, &queue, countOf( input_cl_mem_buffers ), input_cl_mem_buffers, countOf( output_cl_mem_buffers ), output_cl_mem_buffers, &outEvent ); return 0; }
int main() { //Control Variables bool showStartInput=false;// Setting it to true shows the original Input bool showFftOutput=false;// Shows the output after the FFT but before the Reshuffle bool showReshuffleOutput=false;// Shows the output after the reshuffle bool showFinalResult=false; // Shows final result after cross-correlation bool showGemmInput=false; // Shows output after the reshuffle but before the matrix multiplication bool showReformatOutputAfterReshuffle=false; // Shows output after it has been reformatted after the reshuffling //openCL State cl_platform_id platform_id=NULL; cl_device_id device_id=NULL; cl_context context=NULL; cl_command_queue queue=NULL; cl_program program=NULL; cl_kernel kernel=NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret=0; // Stores the error values retuned by many functions cl_event event = NULL; cl_event events[10]; cl_kernel clKernel; //FFT state clAmdFftPlanHandle plHandle; clAmdFftResultLocation place = CLFFT_OUTOFPLACE; //Alternative CLFFT_INPLACE clAmdFftLayout inLayout = CLFFT_COMPLEX_INTERLEAVED; clAmdFftLayout outLayout = CLFFT_COMPLEX_INTERLEAVED; clAmdFftDim dim = CLFFT_1D; size_t clStrides[3]={0,0,0}; size_t clLengths[3]; clLengths[0]=(MEM_SIZE/2);//Length of first dimension of fft clLengths[1]=1;//length of second dimension of fft clLengths[2]=1; clStrides[ 0 ] = 1; clStrides[ 1 ] = clStrides[ 0 ] * clLengths[ 0 ]; clStrides[ 2 ] = clStrides[ 1 ] * clLengths[ 1 ]; clStrides[ 3 ] = clStrides[ 2 ] * clLengths[ 2 ]; size_t batchSize=CHANSIZE;//number of discreet fft's to be calculated simultaneously //Initialise openCL OPENCL_V_THROW(clGetPlatformIDs(1, &platform_id, &ret_num_platforms),"clGetPlatformIDs Failed"); OPENCL_V_THROW(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,&ret_num_devices),"clGetDeviceIDs Failed"); context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); OPENCL_V_THROW(ret, "Creating Context failed" ); queue = clCreateCommandQueue(context, device_id, 0, &ret); OPENCL_V_THROW(ret, "Creating command queue failed" ); //===========Initialise the host buffers====================================== /* * The functions sgenerate2darray(), screate2darray() and sgenerate2darrayout() are defined and declared in definition.h */ float** src_a_h=sgenerate2darray(NO_INPUTS,MEM_SIZE);//To be used to store the original input float** answer=screate2darray(NO_INPUTS,MEM_SIZE);//To be used to store the answer after the reshuffling float** corr_h=sgenerate2darrayout(NO_INPUTS,CHANSIZE << 1,CHANNELNO);// To be used to store the final answer if(showStartInput){ cout << "Initial Input Buffer" << "\n"; for(int j=0;j<NO_INPUTS;j++){ for(int i=0;i<MEM_SIZE;i++){ cout << src_a_h[j][i] << " "; }cout << "\n"; }printf("\n"); } //=================================================================== //Calculation of facs for reshuffling complex <float>* facs_h=(complex <float>*) malloc(sizeof(complex <float>)*(MEM_SIZE/2)); complex<float> I=1.0i; complex <float> xx=2.0*PI; for(int i=0;i<MEM_SIZE/2;i++){ facs_h[i]=(1.0*i)/(1.0*MEM_SIZE); facs_h[i]=exp(xx*(-I*facs_h[i])); } //=================================================================== //Initialise GPU memory buffers size_t sizeofgpumem=NO_INPUTS*MEM_SIZE*sizeof(float); size_t sizeoffacsmem=MEM_SIZE*sizeof(float); cl_mem clMemBuffersIn = clCreateBuffer(context,CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,sizeofgpumem,src_a_h[0],&ret); OPENCL_V_THROW( ret, "Creating clMemBuffersIn Buffer failed" ); cl_mem clMemBuffersOut = clCreateBuffer(context,CL_MEM_READ_WRITE,sizeofgpumem,NULL,&ret); OPENCL_V_THROW (ret, "Creating fft output Buffer failed"); cl_mem facs = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,sizeoffacsmem,facs_h,&ret); OPENCL_V_THROW (ret, "Creating facs Buffer failed"); //===========================Starting the fft=============================// clAmdFftSetupData setupData; OPENCL_V_THROW( clAmdFftInitSetupData( &setupData ),"clAmdFftInitSetupData failed" ); OPENCL_V_THROW( clAmdFftSetup( &setupData ), "clAmdFftSetup failed" ); OPENCL_V_THROW( clAmdFftCreateDefaultPlan( &plHandle, context, dim, clLengths ), "clAmdFftCreateDefaultPlan failed" ); OPENCL_V_THROW (clAmdFftSetPlanBatchSize (plHandle, batchSize),"Setting BatchSize Failed"); OPENCL_V_THROW (clAmdFftSetResultLocation( plHandle, place ), "clAmdFftSetResultLocation failed" ); OPENCL_V_THROW (clAmdFftSetPlanInStride ( plHandle, dim, clStrides ), "clAmdFftSetPlanInStride failed" ); OPENCL_V_THROW (clAmdFftSetPlanOutStride ( plHandle, dim, clStrides ), "clAmdFftSetPlanOutStride failed" ); OPENCL_V_THROW (clAmdFftSetPlanDistance ( plHandle, clStrides[ dim ], clStrides[ dim ]), "clAmdFftSetPlanDistance failed" ); OPENCL_V_THROW( clAmdFftBakePlan( plHandle, 1, &queue, NULL, NULL ), "clAmdFftBakePlan failed" ); size_t tempbuffersize=0; OPENCL_V_THROW( clAmdFftGetTmpBufSize(plHandle, &tempbuffersize ), "clAmdFftGetTmpBufSize failed" ); //allocate the intermediate buffer cl_mem clMedBuffer=NULL; if (tempbuffersize) { cl_int medstatus; clMedBuffer = clCreateBuffer ( context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,tempbuffersize, 0, &medstatus); OPENCL_V_THROW( medstatus, "Creating fft intermediate Buffer failed" ); } if (( place == CLFFT_INPLACE )&& ( inLayout != outLayout )) { switch( inLayout ) { case CLFFT_COMPLEX_INTERLEAVED: { assert (CLFFT_COMPLEX_PLANAR == outLayout); throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" ); break; } case CLFFT_COMPLEX_PLANAR: { assert (CLFFT_COMPLEX_INTERLEAVED == outLayout); throw std::runtime_error( "Cannot use the same buffer for planar->interleaved in-place transforms" ); break; } } } cl_mem * BuffersOut = ( place == CLFFT_INPLACE ) ? NULL : &clMemBuffersOut; //========Timimg fft============// double time_fft_start=omp_get_wtime(); for(int i=0;i<ITER_FFT;i++){ OPENCL_V_THROW( clAmdFftEnqueueTransform( plHandle, CLFFT_FORWARD, 1,&queue,0,NULL,&event,&clMemBuffersIn,BuffersOut,clMedBuffer ),"clAmdFftEnqueueTransform failed" ); } ret=clWaitForEvents(1,&event); double time_fft_end=omp_get_wtime(); //Cleaning up fft OPENCL_V_THROW( clAmdFftDestroyPlan( &plHandle ), "clAmdFftDestroyPlan failed" ); OPENCL_V_THROW( clAmdFftTeardown( ), "clAmdFftTeardown failed" ); //displaying results if(showFftOutput){ OPENCL_V_THROW( clEnqueueReadBuffer( queue, clMemBuffersOut, CL_TRUE, 0, sizeofgpumem,answer [0], 0, NULL, NULL ),"Reading the result buffer failed" ); cout << "**FFT Output**" << endl; for(int j=0;j<NO_INPUTS;j++){ for(int i=0;i<MEM_SIZE;i++){ cout << answer[j][i] << " "; } printf("\n"); }printf("\n"); } //==================End of FFT=============================================// //==================Start the Reshuffling==================================// FILE *fp; char fileName[]="./reshuffle.cl"; char* source_str=NULL; size_t source_size; //Load the source code containing the kernel/ fp = fopen(fileName, "r"); if (!fp) { fprintf(stderr, "Failed to load reshuffle kernel.¥n"); exit(1); } source_str = (char*)malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); //Preparation for building the Kernel program = clCreateProgramWithSource(context, 1, (const char **)&source_str,(const size_t *)&source_size, &ret); OPENCL_V_THROW( ret, "Creating program with source failed for Reshuffle" ); OPENCL_V_THROW( clBuildProgram(program, 1, &device_id, NULL, NULL, NULL),"Build Program Failed for Reshuffle"); kernel = clCreateKernel(program, "reshuffle", &ret); OPENCL_V_THROW( ret, "Creating kernel failed for Reshuffle" ); //Set kernel parameters const int num=NO_INPUTS*MEM_SIZE; const int block=MEM_SIZE; OPENCL_V_THROW(clSetKernelArg(kernel, 0, sizeof(cl_mem), (float *)&clMemBuffersIn),"Passing argument 0 of reshuffle failed"); OPENCL_V_THROW(clSetKernelArg(kernel, 1, sizeof(cl_mem), (float *)&facs),"Passing arg 1 of reshuffle failed"); OPENCL_V_THROW(clSetKernelArg(kernel, 2, sizeof(cl_mem), (float *)&clMemBuffersOut),"Passing arg2 of reshuffle failed"); OPENCL_V_THROW(clSetKernelArg(kernel, 3, sizeof(int), (int *)&num),"Passing arg3 of reshuffle failed"); OPENCL_V_THROW(clSetKernelArg(kernel, 4, sizeof(int), (int *)&block),"Passing arg4 of reshuffle failed"); // Execute OpenCL Kernel // const size_t local_ws=NO_INPUTS*MEM_SIZE; const size_t global_ws=min(NO_THREAD_PER_BLOCK,MEM_SIZE);//ceil(MEM_SIZE/local_ws); //===========timing the reshuffle===============// double time_reshuffle_start=omp_get_wtime(); for(int i=0;i<ITER_FFT;i++){ OPENCL_V_THROW(clEnqueueNDRangeKernel(queue,kernel, 1, NULL,&local_ws,&global_ws, 0, NULL, NULL),"Reshuffle Kernel execution failed"); } double time_reshuffle_end=omp_get_wtime(); //Read back data OPENCL_V_THROW(clEnqueueReadBuffer(queue, clMemBuffersOut, CL_TRUE, 0, sizeofgpumem,answer[0], 0, NULL, NULL),"Reading back reshuffled data failed"); //====================Finish the reshuffling================================// if(showReshuffleOutput){ cout << "Output after reshuffling" << endl; for(int j=0;j<NO_INPUTS;j++){ for(int i=0;i<MEM_SIZE;i++){ cout << answer[j][i] << " "; } printf("\n"); }printf("\n"); } //=================Reformatting the input given to the matrix multiply===================================// float** answer_final=screate2darray(NO_INPUTS*2,MEM_SIZE/2); for(int i=0;i<NO_INPUTS;i++){ for(int j=0;j<MEM_SIZE;j++){ if(j&1) answer_final[(i<<1)+1][j >> 1]=answer[i][j]; else answer_final[(i<<1)][j >> 1]=answer[i][j]; } }