예제 #1
0
파일: Kernels.cpp 프로젝트: sasaki3/clLucas
int
Lucas::fft_setup (int length)
{
  clAmdFftSetupData fftSetupData;
  OPENCL_V_THROW(clAmdFftInitSetupData (&fftSetupData),"Failed to clAmdFftInitSetupData.");
  fftSetupData.debugFlags = CLFFT_DUMP_PROGRAMS;        // Dumps the FFT kernels
  // Setup the AMD FFT library.
  OPENCL_V_THROW(clAmdFftSetup (&fftSetupData),"Failed to clAmdFftSetup.");
  // Create FFT Plan
  const size_t logicalDimensions[1] = { length };
  // Create default plan.
  OPENCL_V_THROW(clAmdFftCreateDefaultPlan(&plan, context(), CLFFT_1D, logicalDimensions),"Failed to clAmdFftCreateDefaultPlan.");
  // Set double precision.
  OPENCL_V_THROW(clAmdFftSetPlanPrecision(plan, CLFFT_DOUBLE),"Failed to clAmdFftSetPlanPrecision.");
  // Set layout.
  OPENCL_V_THROW(clAmdFftSetLayout(plan, CLFFT_COMPLEX_INTERLEAVED,CLFFT_COMPLEX_INTERLEAVED),"Failed to clAmdFftSetLayout.");
  // Normalize forward transformation.
  OPENCL_V_THROW(clAmdFftSetPlanScale(plan, CLFFT_FORWARD, 1.0f/static_cast<cl_float>(length)),"Failed to clAmdFftSetPlanScale.");
  // Normalize backward transformation.
  OPENCL_V_THROW(clAmdFftSetPlanScale(plan, CLFFT_BACKWARD, 1.0f),"Failed to clAmdFftSetPlanScale.");
  // In-place FFT.
  OPENCL_V_THROW(clAmdFftSetResultLocation(plan, CLFFT_INPLACE),"Failed to clAmdFftSetResultLocation.");
  // Set number of transformations per plan.
  OPENCL_V_THROW(clAmdFftSetPlanBatchSize(plan, 1),"Failed to clAmdFftSetPlanBatchSize.");
  // BakePlan
  OPENCL_V_THROW(clAmdFftBakePlan(plan, 1, &commandQueue(), NULL, NULL),"Failed to clAmdFftBakePlan.");
}
예제 #2
0
void AllocateFFTFramework(int *Error){
#ifndef NO_OPENCL
	std::auto_ptr< clAmdFftSetupData > setupData( new clAmdFftSetupData );
	*Error = clAmdFftInitSetupData( setupData.get( ));

	if(*Error != 0){
		*Error = clLabviewDevice::Error(OPENCLV_CANT_INIT_FFT_FRAMEWORK);
		return;
	}

	*Error = clAmdFftSetup( setupData.get( ) );

	if(*Error != 0){
		*Error = clLabviewDevice::Error(OPENCLV_CANT_INIT_FFT_FRAMEWORK);
		return;
	}
#endif
}
int main()
{
        //Control Variables
        bool showStartInput=false;// Setting it to true shows the original Input
        bool showFftOutput=false;// Shows the output after the FFT but before the Reshuffle
        bool showReshuffleOutput=false;// Shows the output after the reshuffle
        bool showFinalResult=false; // Shows final result after cross-correlation
        bool showGemmInput=false; // Shows output after the reshuffle but before the matrix multiplication
	bool showReformatOutputAfterReshuffle=false; // Shows output after it has been reformatted after the reshuffling

	//openCL State
        cl_platform_id platform_id=NULL;
	cl_device_id device_id=NULL;
        cl_context context=NULL;
        cl_command_queue queue=NULL;
        cl_program program=NULL;
        cl_kernel kernel=NULL;
        cl_uint ret_num_devices;
        cl_uint ret_num_platforms;
        cl_int ret=0; // Stores the error values retuned by many functions
	cl_event event = NULL;
        cl_event events[10];
	cl_kernel clKernel;

	//FFT state
	clAmdFftPlanHandle plHandle;
        clAmdFftResultLocation place = CLFFT_OUTOFPLACE; //Alternative CLFFT_INPLACE
        clAmdFftLayout inLayout  = CLFFT_COMPLEX_INTERLEAVED;
        clAmdFftLayout outLayout = CLFFT_COMPLEX_INTERLEAVED;
        clAmdFftDim dim = CLFFT_1D;
	size_t clStrides[3]={0,0,0};
        size_t clLengths[3];
	clLengths[0]=(MEM_SIZE/2);//Length of first dimension of fft
	clLengths[1]=1;//length of second dimension of fft
	clLengths[2]=1;
	clStrides[ 0 ] = 1;
        clStrides[ 1 ] = clStrides[ 0 ] * clLengths[ 0 ];
        clStrides[ 2 ] = clStrides[ 1 ] * clLengths[ 1 ];
        clStrides[ 3 ] = clStrides[ 2 ] * clLengths[ 2 ];
	size_t batchSize=CHANSIZE;//number of discreet fft's to be calculated simultaneously


	//Initialise openCL
	OPENCL_V_THROW(clGetPlatformIDs(1, &platform_id, &ret_num_platforms),"clGetPlatformIDs Failed");
        OPENCL_V_THROW(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,&ret_num_devices),"clGetDeviceIDs Failed");
        context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
        OPENCL_V_THROW(ret, "Creating Context failed" );
        queue = clCreateCommandQueue(context, device_id, 0, &ret);
        OPENCL_V_THROW(ret, "Creating command queue failed" );


//===========Initialise the host buffers======================================

/*
 * The functions sgenerate2darray(), screate2darray() and sgenerate2darrayout() are defined and declared in definition.h
 */

	float** src_a_h=sgenerate2darray(NO_INPUTS,MEM_SIZE);//To be used to store the original input

	float** answer=screate2darray(NO_INPUTS,MEM_SIZE);//To be used to store the answer after the reshuffling

        float** corr_h=sgenerate2darrayout(NO_INPUTS,CHANSIZE << 1,CHANNELNO);// To be used to store the final answer

        if(showStartInput){
            cout << "Initial Input Buffer" << "\n";
            for(int j=0;j<NO_INPUTS;j++){
                for(int i=0;i<MEM_SIZE;i++){
                        cout << src_a_h[j][i] << " ";
                }cout << "\n";
            }printf("\n");
        }


//===================================================================
        //Calculation of facs for reshuffling
	complex <float>* facs_h=(complex <float>*) malloc(sizeof(complex <float>)*(MEM_SIZE/2));
	complex<float> I=1.0i;
        complex <float> xx=2.0*PI;
        for(int i=0;i<MEM_SIZE/2;i++){
                facs_h[i]=(1.0*i)/(1.0*MEM_SIZE);
                facs_h[i]=exp(xx*(-I*facs_h[i]));
        }
//===================================================================
	//Initialise GPU memory buffers
	size_t sizeofgpumem=NO_INPUTS*MEM_SIZE*sizeof(float);
	size_t sizeoffacsmem=MEM_SIZE*sizeof(float);
	cl_mem clMemBuffersIn = clCreateBuffer(context,CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,sizeofgpumem,src_a_h[0],&ret);
	OPENCL_V_THROW( ret, "Creating clMemBuffersIn Buffer failed" );

	cl_mem clMemBuffersOut = clCreateBuffer(context,CL_MEM_READ_WRITE,sizeofgpumem,NULL,&ret);
	OPENCL_V_THROW (ret, "Creating fft output Buffer failed");

	cl_mem facs = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,sizeoffacsmem,facs_h,&ret);
	OPENCL_V_THROW (ret, "Creating facs Buffer failed");

	//===========================Starting the fft=============================//
	clAmdFftSetupData setupData;
	OPENCL_V_THROW( clAmdFftInitSetupData( &setupData ),"clAmdFftInitSetupData failed" );
        OPENCL_V_THROW( clAmdFftSetup( &setupData ), "clAmdFftSetup failed" );
	OPENCL_V_THROW( clAmdFftCreateDefaultPlan( &plHandle, context, dim, clLengths ), "clAmdFftCreateDefaultPlan failed" );
	OPENCL_V_THROW (clAmdFftSetPlanBatchSize (plHandle, batchSize),"Setting BatchSize Failed");
        OPENCL_V_THROW (clAmdFftSetResultLocation( plHandle, place ), "clAmdFftSetResultLocation failed" );
	OPENCL_V_THROW (clAmdFftSetPlanInStride  ( plHandle, dim, clStrides ), "clAmdFftSetPlanInStride failed" );
        OPENCL_V_THROW (clAmdFftSetPlanOutStride ( plHandle, dim, clStrides ), "clAmdFftSetPlanOutStride failed" );
        OPENCL_V_THROW (clAmdFftSetPlanDistance  ( plHandle, clStrides[ dim ], clStrides[ dim ]), "clAmdFftSetPlanDistance failed" );

	OPENCL_V_THROW( clAmdFftBakePlan( plHandle, 1, &queue, NULL, NULL ), "clAmdFftBakePlan failed" );

	size_t tempbuffersize=0;
        OPENCL_V_THROW( clAmdFftGetTmpBufSize(plHandle, &tempbuffersize ), "clAmdFftGetTmpBufSize failed" );

	//allocate the intermediate buffer
        cl_mem clMedBuffer=NULL;
        if (tempbuffersize)
        {
                cl_int medstatus;
                clMedBuffer = clCreateBuffer ( context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,tempbuffersize, 0, &medstatus);
                OPENCL_V_THROW( medstatus, "Creating fft intermediate Buffer failed" );
        }
	if (( place == CLFFT_INPLACE )&& ( inLayout != outLayout )) {
                switch( inLayout )
                {
                case CLFFT_COMPLEX_INTERLEAVED:
                        {
                                assert (CLFFT_COMPLEX_PLANAR == outLayout);
                                throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" );
                                break;
                        }
                case CLFFT_COMPLEX_PLANAR:
                        {
                                assert (CLFFT_COMPLEX_INTERLEAVED == outLayout);
                                throw std::runtime_error( "Cannot use the same buffer for planar->interleaved in-place transforms" );
                                break;
                        }
                }
        }
	cl_mem * BuffersOut = ( place == CLFFT_INPLACE ) ? NULL :  &clMemBuffersOut;

	//========Timimg fft============//

	double time_fft_start=omp_get_wtime();

	for(int i=0;i<ITER_FFT;i++){
		OPENCL_V_THROW( clAmdFftEnqueueTransform( plHandle, CLFFT_FORWARD, 1,&queue,0,NULL,&event,&clMemBuffersIn,BuffersOut,clMedBuffer ),"clAmdFftEnqueueTransform failed" );
	}
	ret=clWaitForEvents(1,&event);

        double time_fft_end=omp_get_wtime();

	//Cleaning up fft
        OPENCL_V_THROW( clAmdFftDestroyPlan( &plHandle ), "clAmdFftDestroyPlan failed" );
        OPENCL_V_THROW( clAmdFftTeardown( ), "clAmdFftTeardown failed" );

        //displaying results
        if(showFftOutput){
            OPENCL_V_THROW( clEnqueueReadBuffer( queue, clMemBuffersOut, CL_TRUE, 0, sizeofgpumem,answer [0], 0, NULL, NULL ),"Reading the result buffer failed" );

            cout << "**FFT Output**" << endl;
            for(int j=0;j<NO_INPUTS;j++){
                    for(int i=0;i<MEM_SIZE;i++){
                            cout << answer[j][i] << " ";
                    } printf("\n");
            }printf("\n");
        }
	//==================End of FFT=============================================//

	//==================Start the Reshuffling==================================//
	FILE *fp;
	char fileName[]="./reshuffle.cl";
	char* source_str=NULL;
	size_t source_size;

	//Load the source code containing the kernel/
 	fp = fopen(fileName, "r");
 	if (!fp) {
 		fprintf(stderr, "Failed to load reshuffle kernel.¥n");
	 	exit(1);
 	}
	source_str = (char*)malloc(MAX_SOURCE_SIZE);
	source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
	fclose(fp);

	//Preparation for building the Kernel
	program = clCreateProgramWithSource(context, 1, (const char **)&source_str,(const size_t *)&source_size, &ret);
        OPENCL_V_THROW( ret, "Creating program with source failed for Reshuffle" );
	OPENCL_V_THROW( clBuildProgram(program, 1, &device_id, NULL, NULL, NULL),"Build Program Failed for Reshuffle");
	kernel = clCreateKernel(program, "reshuffle", &ret);
	OPENCL_V_THROW( ret, "Creating kernel failed for Reshuffle" );

	//Set kernel parameters
	const int num=NO_INPUTS*MEM_SIZE;
	const int block=MEM_SIZE;
	OPENCL_V_THROW(clSetKernelArg(kernel, 0, sizeof(cl_mem), (float *)&clMemBuffersIn),"Passing argument 0 of reshuffle failed");
 	OPENCL_V_THROW(clSetKernelArg(kernel, 1, sizeof(cl_mem), (float *)&facs),"Passing arg 1 of reshuffle failed");
        OPENCL_V_THROW(clSetKernelArg(kernel, 2, sizeof(cl_mem), (float *)&clMemBuffersOut),"Passing arg2 of reshuffle failed");
 	OPENCL_V_THROW(clSetKernelArg(kernel, 3, sizeof(int), (int *)&num),"Passing arg3 of reshuffle failed");
        OPENCL_V_THROW(clSetKernelArg(kernel, 4, sizeof(int), (int *)&block),"Passing arg4 of reshuffle failed");


	// Execute OpenCL Kernel //
	const size_t local_ws=NO_INPUTS*MEM_SIZE;
	const size_t global_ws=min(NO_THREAD_PER_BLOCK,MEM_SIZE);//ceil(MEM_SIZE/local_ws);

	//===========timing the reshuffle===============//
	double time_reshuffle_start=omp_get_wtime();
	for(int i=0;i<ITER_FFT;i++){
		OPENCL_V_THROW(clEnqueueNDRangeKernel(queue,kernel, 1, NULL,&local_ws,&global_ws, 0, NULL, NULL),"Reshuffle Kernel execution failed");
	}
	double time_reshuffle_end=omp_get_wtime();

	//Read back data
	OPENCL_V_THROW(clEnqueueReadBuffer(queue, clMemBuffersOut, CL_TRUE, 0, sizeofgpumem,answer[0], 0, NULL, NULL),"Reading back reshuffled data failed");

	//====================Finish the reshuffling================================//

        if(showReshuffleOutput){
                cout << "Output after reshuffling" << endl;
                for(int j=0;j<NO_INPUTS;j++){
                        for(int i=0;i<MEM_SIZE;i++){
                                cout << answer[j][i] << " ";
                        } printf("\n");
                }printf("\n");
        }
        //=================Reformatting the input given to the matrix multiply===================================//
        float** answer_final=screate2darray(NO_INPUTS*2,MEM_SIZE/2);
        for(int i=0;i<NO_INPUTS;i++){
            for(int j=0;j<MEM_SIZE;j++){
                if(j&1)
                        answer_final[(i<<1)+1][j >> 1]=answer[i][j];
                else
                        answer_final[(i<<1)][j >> 1]=answer[i][j];
            }
        }