int main()
{
        //Control Variables
        bool showStartInput=false;// Setting it to true shows the original Input
        bool showFftOutput=false;// Shows the output after the FFT but before the Reshuffle
        bool showReshuffleOutput=false;// Shows the output after the reshuffle
        bool showFinalResult=false; // Shows final result after cross-correlation
        bool showGemmInput=false; // Shows output after the reshuffle but before the matrix multiplication
	bool showReformatOutputAfterReshuffle=false; // Shows output after it has been reformatted after the reshuffling

	//openCL State
        cl_platform_id platform_id=NULL;
	cl_device_id device_id=NULL;
        cl_context context=NULL;
        cl_command_queue queue=NULL;
        cl_program program=NULL;
        cl_kernel kernel=NULL;
        cl_uint ret_num_devices;
        cl_uint ret_num_platforms;
        cl_int ret=0; // Stores the error values retuned by many functions
	cl_event event = NULL;
        cl_event events[10];
	cl_kernel clKernel;

	//FFT state
	clAmdFftPlanHandle plHandle;
        clAmdFftResultLocation place = CLFFT_OUTOFPLACE; //Alternative CLFFT_INPLACE
        clAmdFftLayout inLayout  = CLFFT_COMPLEX_INTERLEAVED;
        clAmdFftLayout outLayout = CLFFT_COMPLEX_INTERLEAVED;
        clAmdFftDim dim = CLFFT_1D;
	size_t clStrides[3]={0,0,0};
        size_t clLengths[3];
	clLengths[0]=(MEM_SIZE/2);//Length of first dimension of fft
	clLengths[1]=1;//length of second dimension of fft
	clLengths[2]=1;
	clStrides[ 0 ] = 1;
        clStrides[ 1 ] = clStrides[ 0 ] * clLengths[ 0 ];
        clStrides[ 2 ] = clStrides[ 1 ] * clLengths[ 1 ];
        clStrides[ 3 ] = clStrides[ 2 ] * clLengths[ 2 ];
	size_t batchSize=CHANSIZE;//number of discreet fft's to be calculated simultaneously


	//Initialise openCL
	OPENCL_V_THROW(clGetPlatformIDs(1, &platform_id, &ret_num_platforms),"clGetPlatformIDs Failed");
        OPENCL_V_THROW(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,&ret_num_devices),"clGetDeviceIDs Failed");
        context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
        OPENCL_V_THROW(ret, "Creating Context failed" );
        queue = clCreateCommandQueue(context, device_id, 0, &ret);
        OPENCL_V_THROW(ret, "Creating command queue failed" );


//===========Initialise the host buffers======================================

/*
 * The functions sgenerate2darray(), screate2darray() and sgenerate2darrayout() are defined and declared in definition.h
 */

	float** src_a_h=sgenerate2darray(NO_INPUTS,MEM_SIZE);//To be used to store the original input

	float** answer=screate2darray(NO_INPUTS,MEM_SIZE);//To be used to store the answer after the reshuffling

        float** corr_h=sgenerate2darrayout(NO_INPUTS,CHANSIZE << 1,CHANNELNO);// To be used to store the final answer

        if(showStartInput){
            cout << "Initial Input Buffer" << "\n";
            for(int j=0;j<NO_INPUTS;j++){
                for(int i=0;i<MEM_SIZE;i++){
                        cout << src_a_h[j][i] << " ";
                }cout << "\n";
            }printf("\n");
        }


//===================================================================
        //Calculation of facs for reshuffling
	complex <float>* facs_h=(complex <float>*) malloc(sizeof(complex <float>)*(MEM_SIZE/2));
	complex<float> I=1.0i;
        complex <float> xx=2.0*PI;
        for(int i=0;i<MEM_SIZE/2;i++){
                facs_h[i]=(1.0*i)/(1.0*MEM_SIZE);
                facs_h[i]=exp(xx*(-I*facs_h[i]));
        }
//===================================================================
	//Initialise GPU memory buffers
	size_t sizeofgpumem=NO_INPUTS*MEM_SIZE*sizeof(float);
	size_t sizeoffacsmem=MEM_SIZE*sizeof(float);
	cl_mem clMemBuffersIn = clCreateBuffer(context,CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,sizeofgpumem,src_a_h[0],&ret);
	OPENCL_V_THROW( ret, "Creating clMemBuffersIn Buffer failed" );

	cl_mem clMemBuffersOut = clCreateBuffer(context,CL_MEM_READ_WRITE,sizeofgpumem,NULL,&ret);
	OPENCL_V_THROW (ret, "Creating fft output Buffer failed");

	cl_mem facs = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,sizeoffacsmem,facs_h,&ret);
	OPENCL_V_THROW (ret, "Creating facs Buffer failed");

	//===========================Starting the fft=============================//
	clAmdFftSetupData setupData;
	OPENCL_V_THROW( clAmdFftInitSetupData( &setupData ),"clAmdFftInitSetupData failed" );
        OPENCL_V_THROW( clAmdFftSetup( &setupData ), "clAmdFftSetup failed" );
	OPENCL_V_THROW( clAmdFftCreateDefaultPlan( &plHandle, context, dim, clLengths ), "clAmdFftCreateDefaultPlan failed" );
	OPENCL_V_THROW (clAmdFftSetPlanBatchSize (plHandle, batchSize),"Setting BatchSize Failed");
        OPENCL_V_THROW (clAmdFftSetResultLocation( plHandle, place ), "clAmdFftSetResultLocation failed" );
	OPENCL_V_THROW (clAmdFftSetPlanInStride  ( plHandle, dim, clStrides ), "clAmdFftSetPlanInStride failed" );
        OPENCL_V_THROW (clAmdFftSetPlanOutStride ( plHandle, dim, clStrides ), "clAmdFftSetPlanOutStride failed" );
        OPENCL_V_THROW (clAmdFftSetPlanDistance  ( plHandle, clStrides[ dim ], clStrides[ dim ]), "clAmdFftSetPlanDistance failed" );

	OPENCL_V_THROW( clAmdFftBakePlan( plHandle, 1, &queue, NULL, NULL ), "clAmdFftBakePlan failed" );

	size_t tempbuffersize=0;
        OPENCL_V_THROW( clAmdFftGetTmpBufSize(plHandle, &tempbuffersize ), "clAmdFftGetTmpBufSize failed" );

	//allocate the intermediate buffer
        cl_mem clMedBuffer=NULL;
        if (tempbuffersize)
        {
                cl_int medstatus;
                clMedBuffer = clCreateBuffer ( context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,tempbuffersize, 0, &medstatus);
                OPENCL_V_THROW( medstatus, "Creating fft intermediate Buffer failed" );
        }
	if (( place == CLFFT_INPLACE )&& ( inLayout != outLayout )) {
                switch( inLayout )
                {
                case CLFFT_COMPLEX_INTERLEAVED:
                        {
                                assert (CLFFT_COMPLEX_PLANAR == outLayout);
                                throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" );
                                break;
                        }
                case CLFFT_COMPLEX_PLANAR:
                        {
                                assert (CLFFT_COMPLEX_INTERLEAVED == outLayout);
                                throw std::runtime_error( "Cannot use the same buffer for planar->interleaved in-place transforms" );
                                break;
                        }
                }
        }
	cl_mem * BuffersOut = ( place == CLFFT_INPLACE ) ? NULL :  &clMemBuffersOut;

	//========Timimg fft============//

	double time_fft_start=omp_get_wtime();

	for(int i=0;i<ITER_FFT;i++){
		OPENCL_V_THROW( clAmdFftEnqueueTransform( plHandle, CLFFT_FORWARD, 1,&queue,0,NULL,&event,&clMemBuffersIn,BuffersOut,clMedBuffer ),"clAmdFftEnqueueTransform failed" );
	}
	ret=clWaitForEvents(1,&event);

        double time_fft_end=omp_get_wtime();

	//Cleaning up fft
        OPENCL_V_THROW( clAmdFftDestroyPlan( &plHandle ), "clAmdFftDestroyPlan failed" );
        OPENCL_V_THROW( clAmdFftTeardown( ), "clAmdFftTeardown failed" );

        //displaying results
        if(showFftOutput){
            OPENCL_V_THROW( clEnqueueReadBuffer( queue, clMemBuffersOut, CL_TRUE, 0, sizeofgpumem,answer [0], 0, NULL, NULL ),"Reading the result buffer failed" );

            cout << "**FFT Output**" << endl;
            for(int j=0;j<NO_INPUTS;j++){
                    for(int i=0;i<MEM_SIZE;i++){
                            cout << answer[j][i] << " ";
                    } printf("\n");
            }printf("\n");
        }
	//==================End of FFT=============================================//

	//==================Start the Reshuffling==================================//
	FILE *fp;
	char fileName[]="./reshuffle.cl";
	char* source_str=NULL;
	size_t source_size;

	//Load the source code containing the kernel/
 	fp = fopen(fileName, "r");
 	if (!fp) {
 		fprintf(stderr, "Failed to load reshuffle kernel.¥n");
	 	exit(1);
 	}
	source_str = (char*)malloc(MAX_SOURCE_SIZE);
	source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
	fclose(fp);

	//Preparation for building the Kernel
	program = clCreateProgramWithSource(context, 1, (const char **)&source_str,(const size_t *)&source_size, &ret);
        OPENCL_V_THROW( ret, "Creating program with source failed for Reshuffle" );
	OPENCL_V_THROW( clBuildProgram(program, 1, &device_id, NULL, NULL, NULL),"Build Program Failed for Reshuffle");
	kernel = clCreateKernel(program, "reshuffle", &ret);
	OPENCL_V_THROW( ret, "Creating kernel failed for Reshuffle" );

	//Set kernel parameters
	const int num=NO_INPUTS*MEM_SIZE;
	const int block=MEM_SIZE;
	OPENCL_V_THROW(clSetKernelArg(kernel, 0, sizeof(cl_mem), (float *)&clMemBuffersIn),"Passing argument 0 of reshuffle failed");
 	OPENCL_V_THROW(clSetKernelArg(kernel, 1, sizeof(cl_mem), (float *)&facs),"Passing arg 1 of reshuffle failed");
        OPENCL_V_THROW(clSetKernelArg(kernel, 2, sizeof(cl_mem), (float *)&clMemBuffersOut),"Passing arg2 of reshuffle failed");
 	OPENCL_V_THROW(clSetKernelArg(kernel, 3, sizeof(int), (int *)&num),"Passing arg3 of reshuffle failed");
        OPENCL_V_THROW(clSetKernelArg(kernel, 4, sizeof(int), (int *)&block),"Passing arg4 of reshuffle failed");


	// Execute OpenCL Kernel //
	const size_t local_ws=NO_INPUTS*MEM_SIZE;
	const size_t global_ws=min(NO_THREAD_PER_BLOCK,MEM_SIZE);//ceil(MEM_SIZE/local_ws);

	//===========timing the reshuffle===============//
	double time_reshuffle_start=omp_get_wtime();
	for(int i=0;i<ITER_FFT;i++){
		OPENCL_V_THROW(clEnqueueNDRangeKernel(queue,kernel, 1, NULL,&local_ws,&global_ws, 0, NULL, NULL),"Reshuffle Kernel execution failed");
	}
	double time_reshuffle_end=omp_get_wtime();

	//Read back data
	OPENCL_V_THROW(clEnqueueReadBuffer(queue, clMemBuffersOut, CL_TRUE, 0, sizeofgpumem,answer[0], 0, NULL, NULL),"Reading back reshuffled data failed");

	//====================Finish the reshuffling================================//

        if(showReshuffleOutput){
                cout << "Output after reshuffling" << endl;
                for(int j=0;j<NO_INPUTS;j++){
                        for(int i=0;i<MEM_SIZE;i++){
                                cout << answer[j][i] << " ";
                        } printf("\n");
                }printf("\n");
        }
        //=================Reformatting the input given to the matrix multiply===================================//
        float** answer_final=screate2darray(NO_INPUTS*2,MEM_SIZE/2);
        for(int i=0;i<NO_INPUTS;i++){
            for(int j=0;j<MEM_SIZE;j++){
                if(j&1)
                        answer_final[(i<<1)+1][j >> 1]=answer[i][j];
                else
                        answer_final[(i<<1)][j >> 1]=answer[i][j];
            }
        }
Exemple #2
0
clAmdFftPlanHandle CreateFFTPlan(clLabviewDevice *d, 
					int FFTType, int Dimension,  
					int Width, int Height, int Depth,
					int StrideW, int StrideH, int StrideD, int StrideT, 
					int PaddingW, int PaddingH, int PaddingD, 
					size_t *OutputWidthFloat, int SingleOrDouble, int *Error){
	
	clAmdFftPlanHandle	plHandle = 0;

#ifndef NO_OPENCL
	*Error = clLabviewDevice::Error(clLabviewDevice::SanitizeDevice(d));
	if(*Error != 0)
		return NULL;					
						
	clAmdFftResultLocation	place = CLFFT_OUTOFPLACE;       //INPLACE NOT SUPPORTED
	size_t clStridesIn[ 4 ];
	size_t clStridesOut[ 4 ];
	clAmdFftLayout inLayout;
	clAmdFftLayout outLayout; 
	size_t batchSize;
	clAmdFftDim	dim;
	size_t clLengths[ 3 ];

	clAmdFftStatus status;

	int DistanceIn, DistanceOut;
	int OutputWidth;

	int FFTSize;

	if(FFTType == 1){
		//Needed to the FFT does the proper size on the inverse
		FFTSize = (Width - 1)*2;
	}else{
		FFTSize = Width;
	}

	switch(Dimension){
		case 0:
			//1D FFT
			batchSize = Depth*Height;
			dim = CLFFT_1D;
			clLengths[0] = FFTSize;
			clLengths[1] = 1;
			clLengths[2] = 1;
		break;

		case 1:
			//2D FFT
			batchSize = Depth;	
			dim = CLFFT_2D;
			clLengths[0] = FFTSize;
			clLengths[1] = Height;
			clLengths[2] = 1;
		break;
	
		case 2:
			//3D FFT
			batchSize = 1;
			dim = CLFFT_3D;
			clLengths[0] = FFTSize;
			clLengths[1] = Height;
			clLengths[2] = Depth;
		break;
	}

	switch(FFTType){
		//These cases are defined by the labview Type Def
		
		case 0:
			inLayout = CLFFT_REAL;
			outLayout = CLFFT_HERMITIAN_PLANAR;
			//*BufferSize = clLengths[2]*clLengths[1]*(clLengths[0]/2 + 1)*batchSize*sizeof(std::complex<float>)/sizeof(float);
			OutputWidth = FFTSize/2 + 1;
			*OutputWidthFloat = OutputWidth;
			DistanceIn = Width*clLengths[1]*clLengths[2];
			DistanceOut = OutputWidth*clLengths[1]*clLengths[2];
		break;

		case 1:
			inLayout = CLFFT_HERMITIAN_PLANAR;
			outLayout = CLFFT_REAL;
			//*BufferSize = clLengths[2]*clLengths[1]*(clLengths[0] - 1)*2*batchSize;
			OutputWidth = FFTSize;
			*OutputWidthFloat = OutputWidth;
			DistanceIn = Width*clLengths[1]*clLengths[2];
			DistanceOut = OutputWidth*clLengths[1]*clLengths[2];
		break;			
		
		case 2:
			inLayout = CLFFT_COMPLEX_PLANAR;
			outLayout = CLFFT_COMPLEX_PLANAR;
			//*BufferSize = clLengths[2]*clLengths[1]*(clLengths[0])*batchSize*sizeof(std::complex<float>)/sizeof(float);
			OutputWidth = FFTSize;
			*OutputWidthFloat = OutputWidth;
			DistanceIn = Width*clLengths[1]*clLengths[2];
			DistanceOut = OutputWidth*clLengths[1]*clLengths[2];
			place = CLFFT_INPLACE;
		break;
	}

	size_t fftVectorSize= 0, fftVectorSizePadded = 0, fftBatchSize = 0;
	
	clStridesIn[0] = 1;
	clStridesIn[ 1 ] = (clStridesIn[ 0 ])*(Width + PaddingW);
	clStridesIn[ 2 ] = (clStridesIn[ 1 ])*(clLengths[ 1 ] + PaddingH);
	clStridesIn[ 3 ] = (clStridesIn[ 2 ])*(clLengths[ 2 ] + PaddingD);

	clStridesOut[0] = 1;
	clStridesOut[ 1 ] = (clStridesOut[ 0 ])*(OutputWidth + PaddingW);
	clStridesOut[ 2 ] = (clStridesOut[ 1 ])*(clLengths[ 1 ] + PaddingH);
	clStridesOut[ 3 ] = (clStridesOut[ 2 ])*(clLengths[ 2 ] + PaddingD);

	fftVectorSize	= clLengths[ 0 ] * clLengths[ 1 ] * clLengths[ 2 ];
	fftVectorSizePadded = clStridesIn[ 3 ];
	fftBatchSize	= fftVectorSizePadded * batchSize;

	status = clAmdFftCreateDefaultPlan( &plHandle, d->GetContext(), dim, clLengths );

	if(status != 0){
		*Error = clLabviewDevice::Error(OPENCLV_FFT_DEFAULT_PLAN_FAILED);
		return NULL;
	}

	status = clAmdFftSetResultLocation( plHandle, place );

	if(status != 0){
		*Error = clLabviewDevice::Error(OPENCLV_FFT_SET_RESULT_FAILED);
		return NULL;
	}

	status = clAmdFftSetLayout( plHandle, inLayout, outLayout );

	if(status != 0){
		*Error = clLabviewDevice::Error(OPENCLV_FFT_SET_LAYOUT_FAILED);
		return NULL;
	}

	if(SingleOrDouble == 0){
		status = clAmdFftSetPlanPrecision(plHandle, CLFFT_SINGLE);
	}else{
		status = clAmdFftSetPlanPrecision(plHandle, CLFFT_DOUBLE);
	}

	if(status != 0){
		*Error = clLabviewDevice::Error(OPENCLV_FFT_SET_PRECISION_FAILED);
		return NULL;
	}

	status = clAmdFftSetPlanBatchSize( plHandle, batchSize );

	if(status != 0){
		*Error = clLabviewDevice::Error(OPENCLV_FFT_SET_BATCHSIZE_FAILED);
		return NULL;
	}
	
	status = clAmdFftSetPlanInStride  ( plHandle, dim, clStridesIn );

	if(status != 0){
		*Error = clLabviewDevice::Error(OPENCLV_FFT_SET_STRIDES_IN_FAILED);
		return NULL;
	}

	status = clAmdFftSetPlanOutStride ( plHandle, dim, clStridesOut );

	if(status != 0){
		*Error = clLabviewDevice::Error(OPENCLV_FFT_SET_STRIDES_OUT_FAILED);
		return NULL;
	}

	status = clAmdFftSetPlanDistance  ( plHandle, DistanceIn, DistanceOut);

	if(status != 0){
		*Error = clLabviewDevice::Error(OPENCLV_FFT_SET_PLAN_DIST_FAILED);
		return NULL;
	}

	status = clAmdFftBakePlan( plHandle, 0, d->GetQueuePtr(), NULL, NULL );

	if(status != 0){
		*Error = clLabviewDevice::Error(OPENCLV_FFT_BAKE_PLAN_FAILED);
		return NULL;
	}

#endif
	
	return plHandle;


}