// Opens the shared library `filename` and hooks up its reflection entry points.
//
// host_db  - reflection database of the host. It is asserted to be non-null (after the
//            library has been opened) and stored in m_HostReflectionDB so interface
//            implementations can be registered against it later.
// filename - library path/name, passed straight to LoadSharedLibrary.
//
// Returns false when the library could not be opened, true in every other case.
// Both exports ("GetReflectionDatabase" and "AddReflectionImpls") are optional: each is
// only invoked when its lookup succeeds.
bool clutl::Module::Load(clcpp::Database* host_db, const char* filename)
{
	// Open the library; nothing else can happen without a handle
	m_Handle = LoadSharedLibrary(filename);
	if (m_Handle == 0)
	{
		return false;
	}

	// Remember the host database for later interface registration
	clcpp::internal::Assert(host_db != 0);
	m_HostReflectionDB = host_db;

	// Fetch the module's own reflection database, when the module exports one
	typedef clcpp::Database* (*DatabaseGetter)();
	DatabaseGetter get_database = (DatabaseGetter)GetSharedLibraryFunction(m_Handle, "GetReflectionDatabase");
	if (get_database != 0)
	{
		m_ReflectionDB = get_database();
	}

	// Give the module the chance to register any interface implementations it has
	typedef void (*ImplRegistrar)(Module*);
	ImplRegistrar register_impls = (ImplRegistrar)GetSharedLibraryFunction(m_Handle, "AddReflectionImpls");
	if (register_impls != 0)
	{
		register_impls(this);
	}

	return true;
}
// Creates a runtime instance backed by the "pIRuntime" and "pICppRuntimeImpl" shared libraries.
//
// load_properties - forwarded unchanged to the "CreateCRuntime" function exported by
//                   the pIRuntime library.
//
// Returns the new runtime, or 0 when either library fails to load, the creation function
// cannot be found, or the creation function itself returns 0. On every failure path the
// libraries loaded so far are unloaded again.
// On success both library handles are stored in a two-element array allocated through
// crt->AllocateMemory and attached to the runtime as crt->shared_library_data.
static inline CRuntimePtr CreateCRuntime (const pI_bool load_properties = pI_FALSE) {

    // open the two shared libraries the runtime depends on
    SharedLibrary* runtime_lib = LoadSharedLibrary (pI_RUNTIME_MAKE_SHARED_LIB_NAME("pIRuntime"));
    if (runtime_lib == 0)
        return 0;

    SharedLibrary* cpp_impl_lib = LoadSharedLibrary (pI_RUNTIME_MAKE_SHARED_LIB_NAME("pICppRuntimeImpl"));
    if (cpp_impl_lib == 0) {
        UnloadSharedLibrary (runtime_lib);
        return 0;
    }

    // look up the creation function and, when present, instantiate the runtime
    CRuntimePtr runtime = 0;
    CreateCRuntimeFunc create_runtime =
        (CreateCRuntimeFunc) GetSharedLibraryFunction (runtime_lib, "CreateCRuntime");
    if (create_runtime != 0)
        runtime = create_runtime (load_properties);

    // a missing creation function and a failed creation are cleaned up the same way
    if (runtime == 0) {
        UnloadSharedLibrary (runtime_lib);
        UnloadSharedLibrary (cpp_impl_lib);
        return 0;
    }

    // hand the library handles over to the runtime
    // NOTE(review): the result of AllocateMemory is used without a null check - confirm
    // whether it can fail.
    SharedLibrary** lib_slots =
        (SharedLibrary**) runtime->AllocateMemory (runtime, sizeof(SharedLibrary*) * 2);
    lib_slots[0] = runtime_lib;
    lib_slots[1] = cpp_impl_lib;
    runtime->shared_library_data = lib_slots;

    return runtime;
} // static inline CRuntimePtr CreateCRuntime (const pI_bool load_properties = pI_FALSE)
// Creates and initialises a clsparse control object for the given command queue.
//
// queue  - OpenCL command queue passed to the _clsparseControl constructor.
// status - optional out-parameter (may be NULL). Receives clsparseSuccess, or
//          clsparseOutOfHostMemory when no control object could be obtained.
//
// Returns the new control object, or nullptr when the allocation yielded a null pointer.
//
// If the optional "clsparseTimer" shared library can be loaded and exports
// "clsparseGetTimer", the control's pDeviceTimer is set from it.
clsparseControl
clsparseCreateControl( cl_command_queue queue, clsparseStatus *status )
{
    clsparseControl control = new _clsparseControl( queue );

    // Bail out before touching any member: the previous code recorded the error but then
    // dereferenced the null pointer anyway.
    if( !control )
    {
        if( status != NULL )
        {
            *status = clsparseOutOfHostMemory;
        }
        return nullptr;
    }

    control->event = nullptr;
    //    control->off_alpha = 0;
    //    control->off_beta = 0;
    //    control->off_x = 0;
    //    control->off_y = 0;

    control->wavefront_size = 0;
    control->max_wg_size = 0;
    control->async = false;
    control->extended_precision = false;
    control->dpfp_support = false;

    collectEnvParams( control );

    // Discover and load the timer module if present
    void* timerLibHandle = LoadSharedLibrary( "lib", "clsparseTimer", false );
    if( timerLibHandle )
    {
        // Timer module discovered and loaded successfully.
        // The looked-up address is reinterpreted as a function pointer by reading it
        // through a pointer to the function-pointer type.
        void* funcPtr = LoadFunctionAddr( timerLibHandle, "clsparseGetTimer" );
        PFCLSPARSETIMER pfclsparseTimer = *static_cast<PFCLSPARSETIMER*>( static_cast<void*>( &funcPtr ) );

        // Create and initialize our timer class, if the external timer shared library loaded
        if( pfclsparseTimer )
        {
            control->pDeviceTimer = static_cast<clsparseDeviceTimer*> ( pfclsparseTimer( CLSPARSE_GPU ) );
        }
    }

    if( status != NULL )
    {
        *status = clsparseSuccess;
    }

    return control;
}
// Loads the shared library at `path` and asks it to create a Module.
//
// path - library path without extension; the extension is handled by LoadSharedLibrary.
//
// Returns the module produced by the library's "create" export, or NULL when
//   - the library cannot be loaded,
//   - the "create" or "destroy" export is missing, or
//   - "create" itself returns a null module.
// On success a CleanLibraryCallBack built from the library handle and the "destroy"
// export is attached to the module.
//
// NOTE(review): on the failure paths after a successful load the library handle is not
// released; no unload function is visible from here - confirm whether one should be called.
Module * ModuleFactory::createModule(const char * path) const
{
	Info << "Loading " << path;

	// Do not add an extension. It is handled by LoadSharedLibrary.
	void *hDLL = LoadSharedLibrary(path);
	if (hDLL == 0)
		return NULL;

	Instance instanciate = (Instance)GetFunction(hDLL, "create");
	Destroy destroy = (Destroy)GetFunction(hDLL, "destroy");
	if (instanciate == 0 || destroy == 0)
		return NULL;

	Module * module = instanciate();

	// The factory function lives in foreign code: do not dereference a null result.
	if (module == 0)
		return NULL;

	module->setCleanLibraryCallback(new CleanLibraryCallBack(hDLL, destroy));
	return module;
}
int main( int argc, char *argv[ ] ) { cl_double alpha, beta; clsparseIdx_t rows, columns; size_t profileCount; std::string function; std::string precision; std::string root_dir; po::options_description desc( "clSPARSE bench command line options" ); desc.add_options( ) ( "help,h", "produces this help message" ) ( "dirpath,d", po::value( &root_dir ), "Matrix directory" ) ( "alpha,a", po::value<cl_double>( &alpha )->default_value( 1.0f ), "specifies the scalar alpha" ) ( "beta,b", po::value<cl_double>( &beta )->default_value( 0.0f ), "specifies the scalar beta" ) ( "rows", po::value<clsparseIdx_t>( &rows )->default_value( 16 ), "specifies the number of rows for matrix data" ) ( "columns", po::value<clsparseIdx_t>( &columns )->default_value( 16 ), "specifies the number of columns for matrix data" ) ( "function,f", po::value<std::string>( &function )->default_value( "SpMdV" ), "Sparse functions to test. Options: " "SpMdV, SpMdM, SpMSpM, CG, BiCGStab, Csr2Dense, Dense2Csr, Csr2Coo, Coo2Csr" ) ( "precision,r", po::value<std::string>( &precision )->default_value( "s" ), "Options: s,d,c,z" ) ( "profile,p", po::value<size_t>( &profileCount )->default_value( 20 ), "Number of times to run the desired test function" ) ( "extended,e", po::bool_switch()->default_value(false), "Use compensated summation to improve accuracy by emulating extended precision" ) ( "no_zeroes,z", po::bool_switch()->default_value(false), "Disable reading explicit zeroes from the input matrix market file.") ; po::variables_map vm; po::store( po::parse_command_line( argc, argv, desc ), vm ); po::notify( vm ); if( vm.count( "help" ) ) { std::cout << desc << std::endl; return 0; } if( precision != "s" && precision != "d" ) // && precision != "c" && precision != "z" ) { std::cerr << "Invalid value for --precision" << std::endl; return -1; } if( vm.count( "dirpath" ) == 0 ) { std::cerr << "The [" << "root" << "] parameter is missing!" 
<< std::endl; std::cerr << desc << std::endl; return false; } // Discover and load the timer module if present void* timerLibHandle = LoadSharedLibrary( "lib", "clsparseTimer", false ); if( timerLibHandle == NULL ) { std::cerr << "Could not find the external timing library; timings disabled" << std::endl; } cl_bool extended_precision = false; if (vm["extended"].as<bool>()) extended_precision = true; cl_bool explicit_zeroes = true; if (vm["no_zeroes"].as<bool>()) explicit_zeroes = false; // Timer module discovered and loaded successfully // Initialize function pointers to call into the shared module void* funcPtr = LoadFunctionAddr( timerLibHandle, "clsparseGetTimer" ); PFCLSPARSETIMER sparseGetTimer = *static_cast<PFCLSPARSETIMER*>( static_cast<void*>( &funcPtr ) ); std::unique_ptr< clsparseFunc > my_function; if( boost::iequals( function, "SpMdV" ) ) { if( precision == "s" ) my_function = std::unique_ptr< clsparseFunc >( new xSpMdV< float >( sparseGetTimer, profileCount, extended_precision, CL_DEVICE_TYPE_GPU, explicit_zeroes ) ); else if( precision == "d" ) my_function = std::unique_ptr< clsparseFunc >( new xSpMdV< double >( sparseGetTimer, profileCount, extended_precision, CL_DEVICE_TYPE_GPU, explicit_zeroes ) ); else { std::cerr << "Unknown spmdv precision" << std::endl; return -1; } } else if( boost::iequals( function, "CG" ) ) { if( precision == "s" ) my_function = std::unique_ptr< clsparseFunc >( new xCG< float >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, explicit_zeroes ) ); else my_function = std::unique_ptr< clsparseFunc >( new xCG< double >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, explicit_zeroes ) ); } else if( boost::iequals( function, "BiCGStab" ) ) { if( precision == "s" ) my_function = std::unique_ptr< clsparseFunc >( new xBiCGStab< float >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, explicit_zeroes ) ); else my_function = std::unique_ptr< clsparseFunc >( new xBiCGStab< double >( sparseGetTimer, profileCount, 
CL_DEVICE_TYPE_GPU, explicit_zeroes ) ); } else if( boost::iequals( function, "SpMdM" ) ) { if( precision == "s" ) my_function = std::unique_ptr< clsparseFunc >( new xSpMdM< cl_float >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, columns, explicit_zeroes ) ); else my_function = std::unique_ptr< clsparseFunc >( new xSpMdM< cl_double >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, columns, explicit_zeroes ) ); } else if (boost::iequals(function, "SpMSpM")) { if (precision == "s") my_function = std::unique_ptr< clsparseFunc>(new xSpMSpM< cl_float >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, explicit_zeroes ) ); else my_function = std::unique_ptr< clsparseFunc >(new xSpMSpM< cl_double >(sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, explicit_zeroes ) ); } else if( boost::iequals( function, "Coo2Csr" ) ) { if( precision == "s" ) my_function = std::unique_ptr< clsparseFunc >( new xCoo2Csr< float >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, explicit_zeroes ) ); else my_function = std::unique_ptr< clsparseFunc >( new xCoo2Csr< double >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, explicit_zeroes ) ); } else if( boost::iequals( function, "Dense2Csr" ) ) { if( precision == "s" ) my_function = std::unique_ptr< clsparseFunc >( new xDense2Csr< float >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, explicit_zeroes ) ); else my_function = std::unique_ptr< clsparseFunc >( new xDense2Csr< double >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, explicit_zeroes ) ); } else if( boost::iequals( function, "Csr2Dense" ) ) { if( precision == "s" ) my_function = std::unique_ptr< clsparseFunc >( new xCsr2Dense< cl_float >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, explicit_zeroes ) ); else my_function = std::unique_ptr< clsparseFunc >( new xCsr2Dense< cl_double >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, explicit_zeroes ) ); } else if( boost::iequals( function, "Csr2Coo" ) ) { if( precision == "s" ) my_function = std::unique_ptr< 
clsparseFunc >( new xCsr2Coo< cl_float >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, explicit_zeroes ) ); else my_function = std::unique_ptr< clsparseFunc >( new xCsr2Coo< cl_double >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, explicit_zeroes ) ); } else { std::cerr << "Benchmarking unknown function" << std::endl; return -1; } try { std::vector< fs::path > matrix_files = enumMatrices( root_dir ); for( auto& file : matrix_files ) { std::string path = file.string( ); try { my_function->setup_buffer( alpha, beta, path ); } // I expect to catch trow from clsparseHeaderfromFile // If io_exception then we don't need to cleanup. // If runtime_exception is catched we are doomed! catch( clsparse::io_exception& io_exc ) { std::cout << io_exc.what( ) << std::endl; continue; } my_function->initialize_cpu_buffer( ); my_function->initialize_gpu_buffer( ); for( int i = 0; i < profileCount; ++i ) { my_function->call_func( ); my_function->reset_gpu_write_buffer( ); } my_function->cleanup( ); //std::cout << "clSPARSE kernel execution time < ns >: " << my_function->time_in_ns( ) << std::endl; //std::cout << "clSPARSE kernel execution Gflops < " << // my_function->bandwidth_formula( ) << " >: " << my_function->bandwidth( ) << std::endl << std::endl; } } catch( std::exception& exc ) { std::cerr << exc.what( ) << std::endl; return 1; } FreeSharedLibrary( timerLibHandle ); return 0; }
// Runs an impulse-style self test / benchmark of a clFFT transform.
//
// NOTE(review): the body uses a type `T` that is not declared in this block - presumably a
// template parameter selecting the host element type (float/double); confirm against the
// declaration.
//
// Parameters
//   lengths             X, Y, Z transform lengths. MODIFIED IN PLACE: any of the first three
//                       entries equal to 0 is replaced by 1. The FFT dimension is derived
//                       from them (lengths[1] > 1 -> 2D, lengths[2] > 1 -> 3D, else 1D).
//   inStrides,          Input/output strides; elements [0]..[3] are read. Element [3] is
//   outStrides          used as the padded per-batch vector size and is passed to
//                       clfftSetPlanDistance.
//   batch_size          Number of transforms per enqueue (clfftSetPlanBatchSize).
//   in_layout,          clFFT layouts of the input and output data; they determine the
//   out_layout          number and byte size of the device buffers.
//   place               CLFFT_INPLACE or out-of-place.
//   precision, dir      Forwarded to clfftSetPlanPrecision / clfftEnqueueTransform.
//   deviceType,         Forwarded to initializeCL.
//   deviceId, platformId, printInfo
//   command_queue_flags Forwarded to createOpenCLCommandQueue; device timer statistics are
//                       only printed when CL_QUEUE_PROFILING_ENABLE is set.
//   profile_count       Number of times the transform is enqueued.
//   setupData           Passed to clfftSetup.
//
// Sequence
//   1. Compute vector/batch sizes and output buffer size from lengths, strides and layouts.
//   2. Per input layout: create context, queue and buffers (initializeCL,
//      createOpenCLCommandQueue), build the host input and upload it with blocking writes.
//      Complex and real layouts set every addressed element to 1; Hermitian layouts set
//      the first element of each batch to outfftVectorSize.
//   3. Try to load the optional "StatTimer" library and fetch "getStatTimer".
//   4. clfftSetup, create/configure/bake the plan, allocate the temporary buffer if
//      clfftGetTmpBufSize reports a non-zero size.
//   5. Reject unrecognised layouts and unsupported in-place layout combinations.
//   6. Enqueue the transform profile_count times, clFinish, print wall time and gflops
//      (only when profile_count > 1) and, if available, the timer statistics.
//   7. When out-of-place or profile_count == 1, read the result back, compare it with the
//      expected impulse response and print PASS/FAIL.
//   8. Destroy the plan, tear clFFT down and release the CL objects via cleanupCL.
//
// Returns 0 on completion, or CLFFT_INVALID_ARG_VALUE from the layout validation switches.
// Throws std::runtime_error for unsupported layouts and in-place combinations;
// OPENCL_V_THROW presumably throws when the wrapped call reports an error - confirm
// against the macro definition.
//
// NOTE(review): LoadFunctionAddr is called even when timerLibHandle is NULL; confirm the
// helper tolerates a null handle.
// NOTE(review): the "set zero" loops index with cl_uint while the bound is a size_t.
// NOTE(review): the local `Nt` is computed but not used in this function.
int transform( size_t* lengths, const size_t *inStrides, const size_t *outStrides, size_t batch_size, clfftLayout in_layout, clfftLayout out_layout, clfftResultLocation place, clfftPrecision precision, clfftDirection dir, cl_device_type deviceType, cl_int deviceId, cl_int platformId, bool printInfo, cl_uint command_queue_flags, cl_uint profile_count, std::auto_ptr< clfftSetupData > setupData ) { // Our command line does not specify what dimension FFT we wish to transform; we decode // this from the lengths that the user specifies for X, Y, Z. A length of one means that // The user does not want that dimension. const size_t max_dimensions = 3; size_t strides[ 4 ]; size_t o_strides[ 4 ]; size_t fftVectorSize = 0; size_t fftVectorSizePadded = 0; size_t fftBatchSize = 0; size_t outfftVectorSize = 0; size_t outfftVectorSizePadded = 0; size_t outfftBatchSize = 0; size_t size_of_input_buffers_in_bytes = 0; size_t size_of_output_buffers_in_bytes = 0; cl_uint number_of_output_buffers = 0; clfftDim dim = CLFFT_1D; cl_mem input_cl_mem_buffers [2] = { NULL, NULL }; cl_mem output_cl_mem_buffers[2] = { NULL, NULL }; std::vector< cl_device_id > device_id; cl_context context; cl_command_queue queue; cl_event outEvent = NULL; clfftPlanHandle plan_handle; for (unsigned u = 0; u < max_dimensions; ++u) { if (0 != lengths[u]) continue; lengths[u] = 1; } if( lengths[ 1 ] > 1 ) { dim = CLFFT_2D; } if( lengths[ 2 ] > 1 ) { dim = CLFFT_3D; } strides[ 0 ] = inStrides[0]; strides[ 1 ] = inStrides[1]; strides[ 2 ] = inStrides[2]; strides[ 3 ] = inStrides[3]; o_strides[ 0 ] = outStrides[0]; o_strides[ 1 ] = outStrides[1]; o_strides[ 2 ] = outStrides[2]; o_strides[ 3 ] = outStrides[3]; fftVectorSize = lengths[0] * lengths[1] * lengths[2]; fftVectorSizePadded = strides[3]; fftBatchSize = fftVectorSizePadded * batch_size; size_t Nt = 1 + lengths[0]/2; if(place == CLFFT_INPLACE) { outfftVectorSize = fftVectorSize; outfftVectorSizePadded = fftVectorSizePadded; outfftBatchSize = fftBatchSize; } else 
{ outfftVectorSize = lengths[0] * lengths[1] * lengths[2]; outfftVectorSizePadded = o_strides[3]; outfftBatchSize = outfftVectorSizePadded * batch_size; } // Real to complex case if( (in_layout == CLFFT_REAL) || (out_layout == CLFFT_REAL) ) { fftVectorSizePadded = strides[3]; fftBatchSize = fftVectorSizePadded * batch_size; outfftVectorSizePadded = o_strides[3]; outfftBatchSize = outfftVectorSizePadded * batch_size; fftVectorSize = lengths[0] * lengths[1] * lengths[2]; outfftVectorSize = fftVectorSize; } switch( out_layout ) { case CLFFT_COMPLEX_INTERLEAVED: number_of_output_buffers = 1; size_of_output_buffers_in_bytes = outfftBatchSize * sizeof( std::complex< T > ); break; case CLFFT_COMPLEX_PLANAR: number_of_output_buffers = 2; size_of_output_buffers_in_bytes = outfftBatchSize * sizeof(T); break; case CLFFT_HERMITIAN_INTERLEAVED: number_of_output_buffers = 1; size_of_output_buffers_in_bytes = outfftBatchSize * sizeof( std::complex< T > ); break; case CLFFT_HERMITIAN_PLANAR: number_of_output_buffers = 2; size_of_output_buffers_in_bytes = outfftBatchSize * sizeof(T); break; case CLFFT_REAL: number_of_output_buffers = 1; size_of_output_buffers_in_bytes = outfftBatchSize * sizeof(T); break; } // Fill the input buffers switch( in_layout ) { case CLFFT_COMPLEX_INTERLEAVED: { // This call creates our openCL context and sets up our devices; expected to throw on error size_of_input_buffers_in_bytes = fftBatchSize * sizeof( std::complex< T > ); device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo ); createOpenCLCommandQueue( context, command_queue_flags, queue, device_id, size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers, size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers); std::vector< std::complex< T > > input( fftBatchSize ); // set zero for( cl_uint i = 0; i < fftBatchSize; ++i ) { input[ i ] = 0; } // impulse test case for(size_t b = 0; b < batch_size; b++) { size_t p3 = b * strides[3]; for(size_t k = 0; k 
< lengths[2]; k++) { size_t p2 = p3 + k * strides[2]; for(size_t j = 0; j < lengths[1]; j++) { size_t p1 = p2 + j * strides[1]; for(size_t i = 0; i < lengths[0]; i++) { size_t p0 = p1 + i * strides[0]; input[p0] = 1; } } } } OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &input[ 0 ], 0, NULL, &outEvent ), "clEnqueueWriteBuffer failed" ); } break; case CLFFT_COMPLEX_PLANAR: { // This call creates our openCL context and sets up our devices; expected to throw on error size_of_input_buffers_in_bytes = fftBatchSize * sizeof( T ); device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo ); createOpenCLCommandQueue( context, command_queue_flags, queue, device_id, size_of_input_buffers_in_bytes, 2, input_cl_mem_buffers, size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers); std::vector< T > real( fftBatchSize ); std::vector< T > imag( fftBatchSize ); // set zero for( cl_uint i = 0; i < fftBatchSize; ++i ) { real[ i ] = 0; imag[ i ] = 0; } // impulse test case for(size_t b = 0; b < batch_size; b++) { size_t p3 = b * strides[3]; for(size_t k = 0; k < lengths[2]; k++) { size_t p2 = p3 + k * strides[2]; for(size_t j = 0; j < lengths[1]; j++) { size_t p1 = p2 + j * strides[1]; for(size_t i = 0; i < lengths[0]; i++) { size_t p0 = p1 + i * strides[0]; real[p0] = 1; } } } } OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ], 0, NULL, &outEvent ), "clEnqueueWriteBuffer failed" ); OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ], 0, NULL, &outEvent ), "clEnqueueWriteBuffer failed" ); } break; case CLFFT_HERMITIAN_INTERLEAVED: { // This call creates our openCL context and sets up our devices; expected to throw on error size_of_input_buffers_in_bytes = fftBatchSize * sizeof( std::complex< T > ); device_id = 
initializeCL( deviceType, deviceId, platformId, context, printInfo ); createOpenCLCommandQueue( context, command_queue_flags, queue, device_id, size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers, size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers); std::vector< std::complex< T > > input( fftBatchSize ); // set zero for( cl_uint i = 0; i < fftBatchSize; ++i ) { input[ i ] = 0; } // impulse test case for(size_t b = 0; b < batch_size; b++) { size_t p3 = b * strides[3]; input[p3] = static_cast<T>(outfftVectorSize); } OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &input[ 0 ], 0, NULL, &outEvent ), "clEnqueueWriteBuffer failed" ); } break; case CLFFT_HERMITIAN_PLANAR: { // This call creates our openCL context and sets up our devices; expected to throw on error size_of_input_buffers_in_bytes = fftBatchSize * sizeof( T ); device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo ); createOpenCLCommandQueue( context, command_queue_flags, queue, device_id, size_of_input_buffers_in_bytes, 2, input_cl_mem_buffers, size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers); std::vector< T > real( fftBatchSize ); std::vector< T > imag( fftBatchSize ); // set zero for( cl_uint i = 0; i < fftBatchSize; ++i ) { real[ i ] = 0; imag[ i ] = 0; } // impulse test case for(size_t b = 0; b < batch_size; b++) { size_t p3 = b * strides[3]; real[p3] = static_cast<T>(outfftVectorSize); } OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ], 0, NULL, &outEvent ), "clEnqueueWriteBuffer failed" ); OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ], 0, NULL, &outEvent ), "clEnqueueWriteBuffer failed" ); } break; case CLFFT_REAL: { // This call creates our openCL context and sets up our devices; expected 
to throw on error size_of_input_buffers_in_bytes = fftBatchSize * sizeof( T ); device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo ); createOpenCLCommandQueue( context, command_queue_flags, queue, device_id, size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers, size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers); std::vector< T > real( fftBatchSize ); // set zero for( cl_uint i = 0; i < fftBatchSize; ++i ) { real[ i ] = 0; } // impulse test case for(size_t b = 0; b < batch_size; b++) { size_t p3 = b * strides[3]; for(size_t k = 0; k < lengths[2]; k++) { size_t p2 = p3 + k * strides[2]; for(size_t j = 0; j < lengths[1]; j++) { size_t p1 = p2 + j * strides[1]; for(size_t i = 0; i < lengths[0]; i++) { size_t p0 = p1 + i * strides[0]; real[p0] = 1; } } } } OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ], 0, NULL, &outEvent ), "clEnqueueWriteBuffer failed" ); } break; default: { throw std::runtime_error( "Input layout format not yet supported" ); } break; } // Discover and load the timer module if present void* timerLibHandle = LoadSharedLibrary( "lib", "StatTimer", false ); if( timerLibHandle == NULL ) { terr << _T( "Could not find the external timing library; timings disabled" ) << std::endl; } // Timer module discovered and loaded successfully // Initialize function pointers to call into the shared module PFGETSTATTIMER get_timer = reinterpret_cast< PFGETSTATTIMER > ( LoadFunctionAddr( timerLibHandle, "getStatTimer" ) ); // Create and initialize our timer class, if the external timer shared library loaded baseStatTimer* timer = NULL; size_t clFFTID = 0; if( get_timer ) { timer = get_timer( CLFFT_GPU ); timer->Reserve( 1, profile_count ); timer->setNormalize( true ); clFFTID = timer->getUniqueID( "clFFT", 0 ); } OPENCL_V_THROW( clfftSetup( setupData.get( ) ), "clfftSetup failed" ); OPENCL_V_THROW( clfftCreateDefaultPlan( 
&plan_handle, context, dim, lengths ), "clfftCreateDefaultPlan failed" ); // Default plan creates a plan that expects an inPlace transform with interleaved complex numbers OPENCL_V_THROW( clfftSetResultLocation( plan_handle, place ), "clfftSetResultLocation failed" ); OPENCL_V_THROW( clfftSetLayout( plan_handle, in_layout, out_layout ), "clfftSetLayout failed" ); OPENCL_V_THROW( clfftSetPlanBatchSize( plan_handle, batch_size ), "clfftSetPlanBatchSize failed" ); OPENCL_V_THROW( clfftSetPlanPrecision( plan_handle, precision ), "clfftSetPlanPrecision failed" ); OPENCL_V_THROW (clfftSetPlanInStride ( plan_handle, dim, strides ), "clfftSetPlanInStride failed" ); OPENCL_V_THROW (clfftSetPlanOutStride ( plan_handle, dim, o_strides ), "clfftSetPlanOutStride failed" ); OPENCL_V_THROW (clfftSetPlanDistance ( plan_handle, strides[ 3 ], o_strides[ 3 ]), "clfftSetPlanDistance failed" ); // Set backward scale factor to 1.0 for non real FFTs to do correct output checks if(dir == CLFFT_BACKWARD && in_layout != CLFFT_REAL && out_layout != CLFFT_REAL) OPENCL_V_THROW (clfftSetPlanScale( plan_handle, CLFFT_BACKWARD, (cl_float)1.0f ), "clfftSetPlanScale failed" ); OPENCL_V_THROW( clfftBakePlan( plan_handle, 1, &queue, NULL, NULL ), "clfftBakePlan failed" ); //get the buffersize size_t buffersize=0; OPENCL_V_THROW( clfftGetTmpBufSize(plan_handle, &buffersize ), "clfftGetTmpBufSize failed" ); //allocate the intermediate buffer cl_mem clMedBuffer=NULL; if (buffersize) { cl_int medstatus; clMedBuffer = clCreateBuffer ( context, CL_MEM_READ_WRITE, buffersize, 0, &medstatus); OPENCL_V_THROW( medstatus, "Creating intmediate Buffer failed" ); } switch( in_layout ) { case CLFFT_COMPLEX_INTERLEAVED: case CLFFT_COMPLEX_PLANAR: case CLFFT_HERMITIAN_INTERLEAVED: case CLFFT_HERMITIAN_PLANAR: case CLFFT_REAL: break; default: // Don't recognize input layout return CLFFT_INVALID_ARG_VALUE; } switch( out_layout ) { case CLFFT_COMPLEX_INTERLEAVED: case CLFFT_COMPLEX_PLANAR: case 
CLFFT_HERMITIAN_INTERLEAVED: case CLFFT_HERMITIAN_PLANAR: case CLFFT_REAL: break; default: // Don't recognize output layout return CLFFT_INVALID_ARG_VALUE; } if (( place == CLFFT_INPLACE ) && ( in_layout != out_layout )) { switch( in_layout ) { case CLFFT_COMPLEX_INTERLEAVED: { if( (out_layout == CLFFT_COMPLEX_PLANAR) || (out_layout == CLFFT_HERMITIAN_PLANAR) ) { throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" ); } break; } case CLFFT_COMPLEX_PLANAR: { if( (out_layout == CLFFT_COMPLEX_INTERLEAVED) || (out_layout == CLFFT_HERMITIAN_INTERLEAVED) ) { throw std::runtime_error( "Cannot use the same buffer for planar->interleaved in-place transforms" ); } break; } case CLFFT_HERMITIAN_INTERLEAVED: { if( out_layout != CLFFT_REAL ) { throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" ); } break; } case CLFFT_HERMITIAN_PLANAR: { throw std::runtime_error( "Cannot use the same buffer for planar->interleaved in-place transforms" ); break; } case CLFFT_REAL: { if( (out_layout == CLFFT_COMPLEX_PLANAR) || (out_layout == CLFFT_HERMITIAN_PLANAR) ) { throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" ); } break; } } } // Loop as many times as the user specifies to average out the timings // cl_mem * BuffersOut = ( place == CLFFT_INPLACE ) ? 
NULL : &output_cl_mem_buffers[ 0 ]; Timer tr; tr.Start(); for( cl_uint i = 0; i < profile_count; ++i ) { if( timer ) timer->Start( clFFTID ); OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &queue, 0, NULL, &outEvent, &input_cl_mem_buffers[ 0 ], BuffersOut, clMedBuffer ), "clfftEnqueueTransform failed" ); if( timer ) timer->Stop( clFFTID ); } OPENCL_V_THROW( clFinish( queue ), "clFinish failed" ); if(clMedBuffer) clReleaseMemObject(clMedBuffer); double wtime = tr.Sample()/((double)profile_count); size_t totalLen = 1; for(int i=0; i<dim; i++) totalLen *= lengths[i]; double opsconst = 5.0 * (double)totalLen * log((double)totalLen) / log(2.0); if(profile_count > 1) { tout << "\nExecution wall time: " << 1000.0*wtime << " ms" << std::endl; tout << "Execution gflops: " << ((double)batch_size * opsconst)/(1000000000.0*wtime) << std::endl; } if( timer && (command_queue_flags & CL_QUEUE_PROFILING_ENABLE) ) { // Remove all timings that are outside of 2 stddev (keep 65% of samples); we ignore outliers to get a more consistent result timer->pruneOutliers( 2.0 ); timer->Print( ); timer->Reset( ); } /*****************/ FreeSharedLibrary( timerLibHandle ); // Read and check output data // This check is not valid if the FFT is executed multiple times inplace. 
// if (( place == CLFFT_OUTOFPLACE ) || ( profile_count == 1)) { bool checkflag= false; switch( out_layout ) { case CLFFT_HERMITIAN_INTERLEAVED: case CLFFT_COMPLEX_INTERLEAVED: { std::vector< std::complex< T > > output( outfftBatchSize ); if( place == CLFFT_INPLACE ) { OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &output[ 0 ], 0, NULL, NULL ), "Reading the result buffer failed" ); } else { OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &output[ 0 ], 0, NULL, NULL ), "Reading the result buffer failed" ); } //check output data for( cl_uint i = 0; i < outfftBatchSize; ++i ) { if (0 == (i % outfftVectorSizePadded)) { if (output[i].real() != outfftVectorSize) { checkflag = true; break; } } else { if (output[ i ].real() != 0) { checkflag = true; break; } } if (output[ i ].imag() != 0) { checkflag = true; break; } } } break; case CLFFT_HERMITIAN_PLANAR: case CLFFT_COMPLEX_PLANAR: { std::valarray< T > real( outfftBatchSize ); std::valarray< T > imag( outfftBatchSize ); if( place == CLFFT_INPLACE ) { OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ], 0, NULL, NULL ), "Reading the result buffer failed" ); OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ], 0, NULL, NULL ), "Reading the result buffer failed" ); } else { OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &real[ 0 ], 0, NULL, NULL ), "Reading the result buffer failed" ); OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 1 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &imag[ 0 ], 0, NULL, NULL ), "Reading the result buffer failed" ); } // Check output data for( cl_uint i = 0; i < outfftBatchSize; ++i ) { if (0 == (i % outfftVectorSizePadded)) { if (real[i] != 
outfftVectorSize) { checkflag = true; break; } } else { if (real[i] != 0) { checkflag = true; break; } } if (imag[i] != 0) { checkflag = true; break; } } } break; case CLFFT_REAL: { std::valarray< T > real( outfftBatchSize ); if( place == CLFFT_INPLACE ) { OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ], 0, NULL, NULL ), "Reading the result buffer failed" ); } else { OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &real[ 0 ], 0, NULL, NULL ), "Reading the result buffer failed" ); } ////check output data for(size_t b = 0; b < batch_size; b++) { size_t p3 = b * o_strides[3]; for(size_t k = 0; k < lengths[2]; k++) { size_t p2 = p3 + k * o_strides[2]; for(size_t j = 0; j < lengths[1]; j++) { size_t p1 = p2 + j * o_strides[1]; for(size_t i = 0; i < lengths[0]; i++) { size_t p0 = p1 + i * o_strides[0]; if (real[p0] != 1) { checkflag = true; break; } } } } } } break; default: { throw std::runtime_error( "Input layout format not yet supported" ); } break; } if (checkflag) { std::cout << "\n\n\t\tInternal Client Test *****FAIL*****" << std::endl; } else { std::cout << "\n\n\t\tInternal Client Test *****PASS*****" << std::endl; } } OPENCL_V_THROW( clfftDestroyPlan( &plan_handle ), "clfftDestroyPlan failed" ); OPENCL_V_THROW( clfftTeardown( ), "clfftTeardown failed" ); cleanupCL( &context, &queue, countOf( input_cl_mem_buffers ), input_cl_mem_buffers, countOf( output_cl_mem_buffers ), output_cl_mem_buffers, &outEvent ); return 0; }