示例#1
0
bool clutl::Module::Load(clcpp::Database* host_db, const char* filename)
{
	// Load the DLL
	m_Handle = LoadSharedLibrary(filename);
	if (m_Handle == 0)
		return false;

	// Keep this around for registering interface implementations
	clcpp::internal::Assert(host_db != 0);
	m_HostReflectionDB = host_db;

	// Get the module reflection database
	typedef clcpp::Database* (*GetReflectionDatabaseFunc)();
	GetReflectionDatabaseFunc GetReflectionDatabase = (GetReflectionDatabaseFunc)GetSharedLibraryFunction(m_Handle, "GetReflectionDatabase");
	if (GetReflectionDatabase)
		m_ReflectionDB = GetReflectionDatabase();

	// Ask the DLL to register and interface implementations it has
	typedef void (*AddReflectionImplsFunc)(Module*);
	AddReflectionImplsFunc AddReflectionImpls = (AddReflectionImplsFunc)GetSharedLibraryFunction(m_Handle, "AddReflectionImpls");
	if (AddReflectionImpls)
		AddReflectionImpls(this);

	return true;
}
static inline CRuntimePtr CreateCRuntime (const pI_bool load_properties = pI_FALSE) {

	SharedLibrary* crtsl = 0;
	SharedLibrary* cppimplsl = 0;
	CreateCRuntimeFunc cfunc = 0;
	CRuntimePtr crt = 0;
	SharedLibrary** injectptr = 0;

	// load functionality from shared library
	crtsl = LoadSharedLibrary (pI_RUNTIME_MAKE_SHARED_LIB_NAME("pIRuntime"));
	if (crtsl == 0)
		return 0;

	cppimplsl = LoadSharedLibrary (pI_RUNTIME_MAKE_SHARED_LIB_NAME("pICppRuntimeImpl"));
	if (cppimplsl == 0) {
		UnloadSharedLibrary (crtsl);
		return 0;
	}

	// fetch runtime creation function from shared library
	cfunc = (CreateCRuntimeFunc) GetSharedLibraryFunction (crtsl, "CreateCRuntime");
	if (cfunc == 0) {
		UnloadSharedLibrary (crtsl);
		UnloadSharedLibrary (cppimplsl);
		return 0;
	}

	// instantiate runtime
	crt = cfunc (load_properties);
	if (crt == 0) {
		UnloadSharedLibrary (crtsl);
		UnloadSharedLibrary (cppimplsl);
		return 0;
	}

	// inject shared libraries into runtime
	injectptr = (SharedLibrary**) crt->AllocateMemory (crt, sizeof(SharedLibrary*) * 2);
	injectptr[0] = crtsl;
	injectptr[1] = cppimplsl;
	crt->shared_library_data = injectptr;
	return crt;
} // static inline CRuntimePtr CreateCRuntime (const pI_bool load_properties = pI_FALSE)
示例#3
0
clsparseControl
clsparseCreateControl( cl_command_queue queue, clsparseStatus *status )
{
    clsparseControl control = new _clsparseControl( queue );

    clsparseStatus err = clsparseSuccess;
    if( !control )
    {
        control = nullptr;
        err = clsparseOutOfHostMemory;
    }

    control->event = nullptr;
//    control->off_alpha = 0;
//    control->off_beta = 0;
//    control->off_x = 0;
//    control->off_y = 0;

    control->wavefront_size = 0;
    control->max_wg_size = 0;
    control->async = false;
    control->extended_precision = false;
    control->dpfp_support = false;

    collectEnvParams( control );

    //	Discover and load the timer module if present
    void* timerLibHandle = LoadSharedLibrary( "lib", "clsparseTimer", false );
    if( timerLibHandle )
    {
        //	Timer module discovered and loaded successfully
        //	Initialize function pointers to call into the shared module
        // PFCLSPARSETIMER pfclsparseTimer = static_cast<PFCLSPARSETIMER> ( LoadFunctionAddr( timerLibHandle, "clsparseGetTimer" ) );
        void* funcPtr = LoadFunctionAddr( timerLibHandle, "clsparseGetTimer" );
        PFCLSPARSETIMER pfclsparseTimer = *static_cast<PFCLSPARSETIMER*>( static_cast<void*>( &funcPtr ) );

        //	Create and initialize our timer class, if the external timer shared library loaded
        if( pfclsparseTimer )
        {
            control->pDeviceTimer = static_cast<clsparseDeviceTimer*> ( pfclsparseTimer( CLSPARSE_GPU ) );
        }
    }

    if( status != NULL )
    {
        *status = err;
    }

    return control;
}
示例#4
0
    Module * ModuleFactory::createModule(const char * path) const {
        Info << "Loading " << path;
        void *hDLL;
        //do not add extension. It is handled by LoadSharedLibrary.
        hDLL = LoadSharedLibrary(path);
        if (hDLL == 0)
            return NULL;
        Instance instanciate = (Instance)GetFunction(hDLL, "create");
        Destroy destroy = (Destroy)GetFunction(hDLL, "destroy");
        if (instanciate == 0 || destroy == 0)
            return NULL;

        Module * module = instanciate();
        module->setCleanLibraryCallback(new CleanLibraryCallBack(hDLL, destroy));

        return module;
    }
示例#5
0
int main( int argc, char *argv[ ] )
{
    cl_double alpha, beta;
    clsparseIdx_t rows, columns;
    size_t profileCount;
    std::string function;
    std::string precision;
    std::string root_dir;

    po::options_description desc( "clSPARSE bench command line options" );
    desc.add_options( )
        ( "help,h", "produces this help message" )
        ( "dirpath,d", po::value( &root_dir ), "Matrix directory" )
        ( "alpha,a", po::value<cl_double>( &alpha )->default_value( 1.0f ), "specifies the scalar alpha" )
        ( "beta,b", po::value<cl_double>( &beta )->default_value( 0.0f ), "specifies the scalar beta" )
        ( "rows", po::value<clsparseIdx_t>( &rows )->default_value( 16 ), "specifies the number of rows for matrix data" )
        ( "columns", po::value<clsparseIdx_t>( &columns )->default_value( 16 ), "specifies the number of columns for matrix data" )
        ( "function,f", po::value<std::string>( &function )->default_value( "SpMdV" ), "Sparse functions to test. Options: "
                    "SpMdV, SpMdM, SpMSpM, CG, BiCGStab, Csr2Dense, Dense2Csr, Csr2Coo, Coo2Csr" )
        ( "precision,r", po::value<std::string>( &precision )->default_value( "s" ), "Options: s,d,c,z" )
        ( "profile,p", po::value<size_t>( &profileCount )->default_value( 20 ), "Number of times to run the desired test function" )
        ( "extended,e", po::bool_switch()->default_value(false), "Use compensated summation to improve accuracy by emulating extended precision" )
        ( "no_zeroes,z", po::bool_switch()->default_value(false), "Disable reading explicit zeroes from the input matrix market file.")
        ;

    po::variables_map vm;
    po::store( po::parse_command_line( argc, argv, desc ), vm );
    po::notify( vm );

    if( vm.count( "help" ) )
    {
        std::cout << desc << std::endl;
        return 0;
    }

    if( precision != "s" && precision != "d" ) // && precision != "c" && precision != "z" )
    {
        std::cerr << "Invalid value for --precision" << std::endl;
        return -1;
    }

    if( vm.count( "dirpath" ) == 0 )
    {
        std::cerr << "The [" << "root" << "] parameter is missing!" << std::endl;
        std::cerr << desc << std::endl;
        return false;
    }

    //	Discover and load the timer module if present
    void* timerLibHandle = LoadSharedLibrary( "lib", "clsparseTimer", false );
    if( timerLibHandle == NULL )
    {
        std::cerr << "Could not find the external timing library; timings disabled" << std::endl;
    }

    cl_bool extended_precision = false;
    if (vm["extended"].as<bool>())
        extended_precision = true;
    cl_bool explicit_zeroes = true;
    if (vm["no_zeroes"].as<bool>())
        explicit_zeroes = false;

    //	Timer module discovered and loaded successfully
    //	Initialize function pointers to call into the shared module
    void* funcPtr = LoadFunctionAddr( timerLibHandle, "clsparseGetTimer" );
    PFCLSPARSETIMER sparseGetTimer = *static_cast<PFCLSPARSETIMER*>( static_cast<void*>( &funcPtr ) );

    std::unique_ptr< clsparseFunc > my_function;
    if( boost::iequals( function, "SpMdV" ) )
    {
        if( precision == "s" )
            my_function = std::unique_ptr< clsparseFunc >( new xSpMdV< float >( sparseGetTimer, profileCount, extended_precision, CL_DEVICE_TYPE_GPU, explicit_zeroes ) );
        else if( precision == "d" )
            my_function = std::unique_ptr< clsparseFunc >( new xSpMdV< double >( sparseGetTimer, profileCount, extended_precision, CL_DEVICE_TYPE_GPU, explicit_zeroes ) );
        else
        {
            std::cerr << "Unknown spmdv precision" << std::endl;
            return -1;
        }
    }
    else if( boost::iequals( function, "CG" ) )
    {
        if( precision == "s" )
            my_function = std::unique_ptr< clsparseFunc >( new xCG< float >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, explicit_zeroes ) );
        else
            my_function = std::unique_ptr< clsparseFunc >( new xCG< double >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, explicit_zeroes ) );
    }

    else if( boost::iequals( function, "BiCGStab" ) )
    {
        if( precision == "s" )
            my_function = std::unique_ptr< clsparseFunc >( new xBiCGStab< float >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, explicit_zeroes ) );
        else
            my_function = std::unique_ptr< clsparseFunc >( new xBiCGStab< double >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, explicit_zeroes ) );
    }
    else if( boost::iequals( function, "SpMdM" ) )
    {
        if( precision == "s" )
            my_function = std::unique_ptr< clsparseFunc >( new xSpMdM< cl_float >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, columns, explicit_zeroes ) );
        else
            my_function = std::unique_ptr< clsparseFunc >( new xSpMdM< cl_double >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, columns, explicit_zeroes ) );
    }
    else if (boost::iequals(function, "SpMSpM"))
    {
        if (precision == "s")
            my_function = std::unique_ptr< clsparseFunc>(new xSpMSpM< cl_float >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, explicit_zeroes ) );
        else
            my_function = std::unique_ptr< clsparseFunc >(new xSpMSpM< cl_double >(sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, explicit_zeroes ) );
    }
    else if( boost::iequals( function, "Coo2Csr" ) )
    {
        if( precision == "s" )
            my_function = std::unique_ptr< clsparseFunc >( new xCoo2Csr< float >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, explicit_zeroes ) );
        else
            my_function = std::unique_ptr< clsparseFunc >( new xCoo2Csr< double >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, explicit_zeroes ) );
    }
    else if( boost::iequals( function, "Dense2Csr" ) )
    {
        if( precision == "s" )
            my_function = std::unique_ptr< clsparseFunc >( new xDense2Csr< float >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, explicit_zeroes ) );
        else
            my_function = std::unique_ptr< clsparseFunc >( new xDense2Csr< double >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, explicit_zeroes ) );
    }
    else if( boost::iequals( function, "Csr2Dense" ) )
    {
        if( precision == "s" )
            my_function = std::unique_ptr< clsparseFunc >( new xCsr2Dense< cl_float >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, explicit_zeroes ) );
        else
            my_function = std::unique_ptr< clsparseFunc >( new xCsr2Dense< cl_double >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, explicit_zeroes ) );
    }
    else if( boost::iequals( function, "Csr2Coo" ) )
    {
        if( precision == "s" )
            my_function = std::unique_ptr< clsparseFunc >( new xCsr2Coo< cl_float >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, explicit_zeroes ) );
        else
            my_function = std::unique_ptr< clsparseFunc >( new xCsr2Coo< cl_double >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU, explicit_zeroes ) );
    }
    else
    {
        std::cerr << "Benchmarking unknown function" << std::endl;
        return -1;
    }

    try
    {
        std::vector< fs::path > matrix_files = enumMatrices( root_dir );

        for( auto& file : matrix_files )
        {
            std::string path = file.string( );
            try {
                my_function->setup_buffer( alpha, beta, path );
            }
            // I expect to catch trow from clsparseHeaderfromFile
            // If io_exception then we don't need to cleanup.
            // If runtime_exception is catched we are doomed!
            catch( clsparse::io_exception& io_exc )
            {
                std::cout << io_exc.what( ) << std::endl;
                continue;
            }
            my_function->initialize_cpu_buffer( );
            my_function->initialize_gpu_buffer( );

            for( int i = 0; i < profileCount; ++i )
            {
                my_function->call_func( );
                my_function->reset_gpu_write_buffer( );
            }
            my_function->cleanup( );

            //std::cout << "clSPARSE kernel execution time < ns >: " << my_function->time_in_ns( ) << std::endl;
            //std::cout << "clSPARSE kernel execution Gflops < " <<
            //    my_function->bandwidth_formula( ) << " >: " << my_function->bandwidth( ) << std::endl << std::endl;
        }
    }
    catch( std::exception& exc )
    {
        std::cerr << exc.what( ) << std::endl;
        return 1;
    }

    FreeSharedLibrary( timerLibHandle );

    return 0;
}
示例#6
0
int transform( size_t* lengths, const size_t *inStrides, const size_t *outStrides, size_t batch_size,
				clfftLayout in_layout, clfftLayout out_layout,
				clfftResultLocation place, clfftPrecision precision, clfftDirection dir,
				cl_device_type deviceType, cl_int deviceId, cl_int platformId, bool printInfo,
				cl_uint command_queue_flags, cl_uint profile_count,
				std::auto_ptr< clfftSetupData > setupData )
{
	//	Our command line does not specify what dimension FFT we wish to transform; we decode
	//	this from the lengths that the user specifies for X, Y, Z.  A length of one means that
	//	The user does not want that dimension.

	const size_t max_dimensions = 3;
	size_t strides[ 4 ];
	size_t o_strides[ 4 ];
	size_t fftVectorSize = 0;
	size_t fftVectorSizePadded = 0;
	size_t fftBatchSize = 0;
	size_t outfftVectorSize = 0;
	size_t outfftVectorSizePadded = 0;
	size_t outfftBatchSize = 0;
	size_t size_of_input_buffers_in_bytes = 0;
	size_t size_of_output_buffers_in_bytes = 0;
	cl_uint number_of_output_buffers = 0;
	clfftDim	dim = CLFFT_1D;
	cl_mem input_cl_mem_buffers [2] = { NULL, NULL };
	cl_mem output_cl_mem_buffers[2] = { NULL, NULL };
	std::vector< cl_device_id > device_id;
	cl_context context;
	cl_command_queue queue;
	cl_event outEvent = NULL;
	clfftPlanHandle plan_handle;

	for (unsigned u = 0; u < max_dimensions; ++u) {
		if (0 != lengths[u])
			continue;
		lengths[u] = 1;
	}

	if( lengths[ 1 ] > 1 )
	{
		dim	= CLFFT_2D;
	}
	if( lengths[ 2 ] > 1 )
	{
		dim	= CLFFT_3D;
	}

	strides[ 0 ] = inStrides[0];
	strides[ 1 ] = inStrides[1];
	strides[ 2 ] = inStrides[2];
	strides[ 3 ] = inStrides[3];

	o_strides[ 0 ] = outStrides[0];
	o_strides[ 1 ] = outStrides[1];
	o_strides[ 2 ] = outStrides[2];
	o_strides[ 3 ] = outStrides[3];

	fftVectorSize = lengths[0] * lengths[1] * lengths[2];
	fftVectorSizePadded = strides[3];
	fftBatchSize = fftVectorSizePadded * batch_size;

	size_t Nt = 1 + lengths[0]/2;

	if(place == CLFFT_INPLACE)
	{
		outfftVectorSize = fftVectorSize;
		outfftVectorSizePadded = fftVectorSizePadded;
		outfftBatchSize = fftBatchSize;
	}
	else
	{
		outfftVectorSize = lengths[0] * lengths[1] * lengths[2];
		outfftVectorSizePadded = o_strides[3];
		outfftBatchSize = outfftVectorSizePadded * batch_size;
	}


	// Real to complex case
	if( (in_layout == CLFFT_REAL) || (out_layout == CLFFT_REAL) )
	{
		fftVectorSizePadded = strides[3];
		fftBatchSize = fftVectorSizePadded * batch_size;

		outfftVectorSizePadded = o_strides[3];
		outfftBatchSize = outfftVectorSizePadded * batch_size;

		fftVectorSize = lengths[0] * lengths[1] * lengths[2];
		outfftVectorSize = fftVectorSize;

	}


	switch( out_layout )
	{
	case CLFFT_COMPLEX_INTERLEAVED:
		number_of_output_buffers = 1;
		size_of_output_buffers_in_bytes = outfftBatchSize * sizeof( std::complex< T > );
		break;
	case CLFFT_COMPLEX_PLANAR:
		number_of_output_buffers = 2;
		size_of_output_buffers_in_bytes = outfftBatchSize * sizeof(T);
		break;
	case CLFFT_HERMITIAN_INTERLEAVED:
		number_of_output_buffers = 1;
		size_of_output_buffers_in_bytes = outfftBatchSize * sizeof( std::complex< T > );
		break;
	case CLFFT_HERMITIAN_PLANAR:
		number_of_output_buffers = 2;
		size_of_output_buffers_in_bytes = outfftBatchSize * sizeof(T);
		break;
	case CLFFT_REAL:
		number_of_output_buffers = 1;
		size_of_output_buffers_in_bytes = outfftBatchSize * sizeof(T);
		break;
	}

	// Fill the input buffers
	switch( in_layout )
	{
	case CLFFT_COMPLEX_INTERLEAVED:
		{
			//	This call creates our openCL context and sets up our devices; expected to throw on error
			size_of_input_buffers_in_bytes = fftBatchSize * sizeof( std::complex< T > );

			device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo );
			createOpenCLCommandQueue( context,
				command_queue_flags, queue,
				device_id,
				size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers,
				size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);

			std::vector< std::complex< T > > input( fftBatchSize );

			// set zero
			for( cl_uint i = 0; i < fftBatchSize; ++i )
			{
				input[ i ] = 0;
			}

			// impulse test case
			for(size_t b = 0; b < batch_size; b++)
			{
				size_t p3 = b * strides[3];
				for(size_t k = 0; k < lengths[2]; k++)
				{
					size_t p2 = p3 + k * strides[2];
					for(size_t j = 0; j < lengths[1]; j++)
					{
						size_t p1 = p2 + j * strides[1];
						for(size_t i = 0; i < lengths[0]; i++)
						{
							size_t p0 = p1 + i * strides[0];
							input[p0] = 1;
						}
					}
				}
			}


			OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &input[ 0 ],
				0, NULL, &outEvent ),
				"clEnqueueWriteBuffer failed" );

		}
		break;
	case CLFFT_COMPLEX_PLANAR:
		{
			//	This call creates our openCL context and sets up our devices; expected to throw on error
			size_of_input_buffers_in_bytes = fftBatchSize * sizeof( T );

			device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo );
			createOpenCLCommandQueue( context,
				command_queue_flags, queue,
				device_id,
				size_of_input_buffers_in_bytes, 2, input_cl_mem_buffers,
				size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);

			std::vector< T > real( fftBatchSize );
			std::vector< T > imag( fftBatchSize );

			// set zero
			for( cl_uint i = 0; i < fftBatchSize; ++i )
			{
				real[ i ] = 0;
				imag[ i ] = 0;
			}

			// impulse test case
			for(size_t b = 0; b < batch_size; b++)
			{
				size_t p3 = b * strides[3];
				for(size_t k = 0; k < lengths[2]; k++)
				{
					size_t p2 = p3 + k * strides[2];
					for(size_t j = 0; j < lengths[1]; j++)
					{
						size_t p1 = p2 + j * strides[1];
						for(size_t i = 0; i < lengths[0]; i++)
						{
							size_t p0 = p1 + i * strides[0];
							real[p0] = 1;
						}
					}
				}
			}


			OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ],
				0, NULL, &outEvent ),
				"clEnqueueWriteBuffer failed" );
			OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ],
				0, NULL, &outEvent ),
				"clEnqueueWriteBuffer failed" );
		}
		break;
	case CLFFT_HERMITIAN_INTERLEAVED:
		{
			//	This call creates our openCL context and sets up our devices; expected to throw on error
			size_of_input_buffers_in_bytes = fftBatchSize * sizeof( std::complex< T > );

			device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo );
			createOpenCLCommandQueue( context,
				command_queue_flags, queue,
				device_id,
				size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers,
				size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);

			std::vector< std::complex< T > > input( fftBatchSize );

			// set zero
			for( cl_uint i = 0; i < fftBatchSize; ++i )
			{
				input[ i ] = 0;
			}

			// impulse test case
			for(size_t b = 0; b < batch_size; b++)
			{
				size_t p3 = b * strides[3];
				input[p3] = static_cast<T>(outfftVectorSize);

			}


			OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &input[ 0 ],
				0, NULL, &outEvent ),
				"clEnqueueWriteBuffer failed" );
		}
		break;
	case CLFFT_HERMITIAN_PLANAR:
		{
			//	This call creates our openCL context and sets up our devices; expected to throw on error
			size_of_input_buffers_in_bytes = fftBatchSize * sizeof( T );

			device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo );
			createOpenCLCommandQueue( context,
				command_queue_flags, queue,
				device_id,
				size_of_input_buffers_in_bytes, 2, input_cl_mem_buffers,
				size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);

			std::vector< T > real( fftBatchSize );
			std::vector< T > imag( fftBatchSize );

			// set zero
			for( cl_uint i = 0; i < fftBatchSize; ++i )
			{
				real[ i ] = 0;
				imag[ i ] = 0;
			}

			// impulse test case
			for(size_t b = 0; b < batch_size; b++)
			{
				size_t p3 = b * strides[3];
				real[p3] = static_cast<T>(outfftVectorSize);
			}



			OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ],
				0, NULL, &outEvent ),
				"clEnqueueWriteBuffer failed" );
			OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ],
				0, NULL, &outEvent ),
				"clEnqueueWriteBuffer failed" );
		}
		break;
	case CLFFT_REAL:
		{
			//	This call creates our openCL context and sets up our devices; expected to throw on error
			size_of_input_buffers_in_bytes = fftBatchSize * sizeof( T );

			device_id = initializeCL( deviceType, deviceId, platformId, context, printInfo );
			createOpenCLCommandQueue( context,
				command_queue_flags, queue,
				device_id,
				size_of_input_buffers_in_bytes, 1, input_cl_mem_buffers,
				size_of_output_buffers_in_bytes, number_of_output_buffers, output_cl_mem_buffers);

			std::vector< T > real( fftBatchSize );

			// set zero
			for( cl_uint i = 0; i < fftBatchSize; ++i )
			{
				real[ i ] = 0;
			}

			// impulse test case
			for(size_t b = 0; b < batch_size; b++)
			{
				size_t p3 = b * strides[3];
				for(size_t k = 0; k < lengths[2]; k++)
				{
					size_t p2 = p3 + k * strides[2];
					for(size_t j = 0; j < lengths[1]; j++)
					{
						size_t p1 = p2 + j * strides[1];
						for(size_t i = 0; i < lengths[0]; i++)
						{
							size_t p0 = p1 + i * strides[0];
							real[p0] = 1;
						}
					}
				}
			}


			OPENCL_V_THROW( clEnqueueWriteBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ],
				0, NULL, &outEvent ),
				"clEnqueueWriteBuffer failed" );
		}
		break;
	default:
		{
			throw std::runtime_error( "Input layout format not yet supported" );
		}
		break;
	}

	//	Discover and load the timer module if present
	void* timerLibHandle = LoadSharedLibrary( "lib", "StatTimer", false );
	if( timerLibHandle == NULL )
	{
		terr << _T( "Could not find the external timing library; timings disabled" ) << std::endl;
	}


	//	Timer module discovered and loaded successfully
	//	Initialize function pointers to call into the shared module
	PFGETSTATTIMER get_timer = reinterpret_cast< PFGETSTATTIMER > ( LoadFunctionAddr( timerLibHandle, "getStatTimer" ) );

	//	Create and initialize our timer class, if the external timer shared library loaded
	baseStatTimer* timer = NULL;
	size_t	clFFTID = 0;
	if( get_timer )
	{
		timer = get_timer( CLFFT_GPU );
		timer->Reserve( 1, profile_count );
		timer->setNormalize( true );

		clFFTID	= timer->getUniqueID( "clFFT", 0 );
	}

	OPENCL_V_THROW( clfftSetup( setupData.get( ) ), "clfftSetup failed" );
	OPENCL_V_THROW( clfftCreateDefaultPlan( &plan_handle, context, dim, lengths ), "clfftCreateDefaultPlan failed" );

	//	Default plan creates a plan that expects an inPlace transform with interleaved complex numbers
	OPENCL_V_THROW( clfftSetResultLocation( plan_handle, place ), "clfftSetResultLocation failed" );
	OPENCL_V_THROW( clfftSetLayout( plan_handle, in_layout, out_layout ), "clfftSetLayout failed" );
	OPENCL_V_THROW( clfftSetPlanBatchSize( plan_handle, batch_size ), "clfftSetPlanBatchSize failed" );
	OPENCL_V_THROW( clfftSetPlanPrecision( plan_handle, precision ), "clfftSetPlanPrecision failed" );

	OPENCL_V_THROW (clfftSetPlanInStride  ( plan_handle, dim, strides ), "clfftSetPlanInStride failed" );
	OPENCL_V_THROW (clfftSetPlanOutStride ( plan_handle, dim, o_strides ), "clfftSetPlanOutStride failed" );
	OPENCL_V_THROW (clfftSetPlanDistance  ( plan_handle, strides[ 3 ], o_strides[ 3 ]), "clfftSetPlanDistance failed" );

	// Set backward scale factor to 1.0 for non real FFTs to do correct output checks
	if(dir == CLFFT_BACKWARD && in_layout != CLFFT_REAL && out_layout != CLFFT_REAL)
		OPENCL_V_THROW (clfftSetPlanScale( plan_handle, CLFFT_BACKWARD, (cl_float)1.0f ), "clfftSetPlanScale failed" );

	OPENCL_V_THROW( clfftBakePlan( plan_handle, 1, &queue, NULL, NULL ), "clfftBakePlan failed" );

	//get the buffersize
	size_t buffersize=0;
	OPENCL_V_THROW( clfftGetTmpBufSize(plan_handle, &buffersize ), "clfftGetTmpBufSize failed" );

	//allocate the intermediate buffer
	cl_mem clMedBuffer=NULL;

	if (buffersize)
	{
		cl_int medstatus;
		clMedBuffer = clCreateBuffer ( context, CL_MEM_READ_WRITE, buffersize, 0, &medstatus);
		OPENCL_V_THROW( medstatus, "Creating intmediate Buffer failed" );
	}

	switch( in_layout )
	{
	case CLFFT_COMPLEX_INTERLEAVED:
	case CLFFT_COMPLEX_PLANAR:
	case CLFFT_HERMITIAN_INTERLEAVED:
	case CLFFT_HERMITIAN_PLANAR:
	case CLFFT_REAL:
		break;
	default:
		//	Don't recognize input layout
		return CLFFT_INVALID_ARG_VALUE;
	}

	switch( out_layout )
	{
	case CLFFT_COMPLEX_INTERLEAVED:
	case CLFFT_COMPLEX_PLANAR:
	case CLFFT_HERMITIAN_INTERLEAVED:
	case CLFFT_HERMITIAN_PLANAR:
	case CLFFT_REAL:
		break;
	default:
		//	Don't recognize output layout
		return CLFFT_INVALID_ARG_VALUE;
	}

	if (( place == CLFFT_INPLACE )
	&&  ( in_layout != out_layout )) {
		switch( in_layout )
		{
		case CLFFT_COMPLEX_INTERLEAVED:
			{
				if( (out_layout == CLFFT_COMPLEX_PLANAR) || (out_layout == CLFFT_HERMITIAN_PLANAR) )
				{
					throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" );
				}
				break;
			}
		case CLFFT_COMPLEX_PLANAR:
			{
				if( (out_layout == CLFFT_COMPLEX_INTERLEAVED) || (out_layout == CLFFT_HERMITIAN_INTERLEAVED) )
				{
					throw std::runtime_error( "Cannot use the same buffer for planar->interleaved in-place transforms" );
				}
				break;
			}
		case CLFFT_HERMITIAN_INTERLEAVED:
			{
				if( out_layout != CLFFT_REAL )
				{
					throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" );
				}
				break;
			}
		case CLFFT_HERMITIAN_PLANAR:
			{
				throw std::runtime_error( "Cannot use the same buffer for planar->interleaved in-place transforms" );
				break;
			}
		case CLFFT_REAL:
			{
				if( (out_layout == CLFFT_COMPLEX_PLANAR) || (out_layout == CLFFT_HERMITIAN_PLANAR) )
				{
					throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" );
				}
				break;
			}
		}
	}

	//	Loop as many times as the user specifies to average out the timings
	//
	cl_mem * BuffersOut = ( place == CLFFT_INPLACE ) ? NULL : &output_cl_mem_buffers[ 0 ];

	Timer tr;
	tr.Start();

	for( cl_uint i = 0; i < profile_count; ++i )
	{
		if( timer ) timer->Start( clFFTID );

		OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &queue, 0, NULL, &outEvent,
			&input_cl_mem_buffers[ 0 ], BuffersOut, clMedBuffer ),
			"clfftEnqueueTransform failed" );

		if( timer ) timer->Stop( clFFTID );
	}
	OPENCL_V_THROW( clFinish( queue ), "clFinish failed" );
	if(clMedBuffer) clReleaseMemObject(clMedBuffer);

	double wtime = tr.Sample()/((double)profile_count);
	size_t totalLen = 1;
	for(int i=0; i<dim; i++) totalLen *= lengths[i];
	double opsconst = 5.0 * (double)totalLen * log((double)totalLen) / log(2.0);

	if(profile_count > 1)
	{
		tout << "\nExecution wall time: " << 1000.0*wtime << " ms" << std::endl;
		tout << "Execution gflops: " << ((double)batch_size * opsconst)/(1000000000.0*wtime) << std::endl;
	}

	if( timer && (command_queue_flags & CL_QUEUE_PROFILING_ENABLE) )
	{
		//	Remove all timings that are outside of 2 stddev (keep 65% of samples); we ignore outliers to get a more consistent result
		timer->pruneOutliers( 2.0 );
		timer->Print( );
		timer->Reset( );
	}

	/*****************/
	FreeSharedLibrary( timerLibHandle );

	// Read and check output data
	// This check is not valid if the FFT is executed multiple times inplace.
	//
	if (( place == CLFFT_OUTOFPLACE )
	||  ( profile_count == 1))
	{
		bool checkflag= false;
		switch( out_layout )
		{
		case CLFFT_HERMITIAN_INTERLEAVED:
		case CLFFT_COMPLEX_INTERLEAVED:
			{
				std::vector< std::complex< T > > output( outfftBatchSize );

				if( place == CLFFT_INPLACE )
				{
					OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &output[ 0 ],
						0, NULL, NULL ),
						"Reading the result buffer failed" );
				}
				else
				{
					OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &output[ 0 ],
						0, NULL, NULL ),
						"Reading the result buffer failed" );
				}

				//check output data
				for( cl_uint i = 0; i < outfftBatchSize; ++i )
				{
					if (0 == (i % outfftVectorSizePadded))
					{
						if (output[i].real() != outfftVectorSize)
						{
							checkflag = true;
							break;
						}
					}
					else
					{
						if (output[ i ].real() != 0)
						{
							checkflag = true;
							break;
						}
					}

					if (output[ i ].imag() != 0)
					{
						checkflag = true;
						break;
					}
				}
			}
			break;
		case CLFFT_HERMITIAN_PLANAR:
		case CLFFT_COMPLEX_PLANAR:
			{
				std::valarray< T > real( outfftBatchSize );
				std::valarray< T > imag( outfftBatchSize );

				if( place == CLFFT_INPLACE )
				{
					OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ],
						0, NULL, NULL ),
						"Reading the result buffer failed" );
					OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 1 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &imag[ 0 ],
						0, NULL, NULL ),
						"Reading the result buffer failed" );
				}
				else
				{
					OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &real[ 0 ],
						0, NULL, NULL ),
						"Reading the result buffer failed" );
					OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 1 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &imag[ 0 ],
						0, NULL, NULL ),
						"Reading the result buffer failed" );
				}

				//  Check output data
				for( cl_uint i = 0; i < outfftBatchSize; ++i )
				{
					if (0 == (i % outfftVectorSizePadded))
					{
						if (real[i] != outfftVectorSize)
						{
							checkflag = true;
							break;
						}
					}
					else
					{
						if (real[i] != 0)
						{
							checkflag = true;
							break;
						}
					}

					if (imag[i] != 0)
					{
						checkflag = true;
						break;
					}
				}
			}
			break;
		case CLFFT_REAL:
			{
				std::valarray< T > real( outfftBatchSize );

				if( place == CLFFT_INPLACE )
				{
					OPENCL_V_THROW( clEnqueueReadBuffer( queue, input_cl_mem_buffers[ 0 ], CL_TRUE, 0, size_of_input_buffers_in_bytes, &real[ 0 ],
						0, NULL, NULL ),
						"Reading the result buffer failed" );
				}
				else
				{
					OPENCL_V_THROW( clEnqueueReadBuffer( queue, BuffersOut[ 0 ], CL_TRUE, 0, size_of_output_buffers_in_bytes, &real[ 0 ],
						0, NULL, NULL ),
						"Reading the result buffer failed" );
				}

				////check output data

				for(size_t b = 0; b < batch_size; b++)
				{
					size_t p3 = b * o_strides[3];
					for(size_t k = 0; k < lengths[2]; k++)
					{
						size_t p2 = p3 + k * o_strides[2];
						for(size_t j = 0; j < lengths[1]; j++)
						{
							size_t p1 = p2 + j * o_strides[1];
							for(size_t i = 0; i < lengths[0]; i++)
							{
								size_t p0 = p1 + i * o_strides[0];

								if (real[p0] != 1)
								{
									checkflag = true;
									break;
								}

							}
						}
					}
				}
			}
			break;
		default:
			{
				throw std::runtime_error( "Input layout format not yet supported" );
			}
			break;
		}

		if (checkflag)
		{
			std::cout << "\n\n\t\tInternal Client Test *****FAIL*****" << std::endl;
		}
		else
		{
			std::cout << "\n\n\t\tInternal Client Test *****PASS*****" << std::endl;
		}
	}

	OPENCL_V_THROW( clfftDestroyPlan( &plan_handle ), "clfftDestroyPlan failed" );
	OPENCL_V_THROW( clfftTeardown( ), "clfftTeardown failed" );

	cleanupCL( &context, &queue, countOf( input_cl_mem_buffers ), input_cl_mem_buffers, countOf( output_cl_mem_buffers ), output_cl_mem_buffers, &outEvent );
	return 0;
}