int main() { //Control Variables bool showStartInput=false;// Setting it to true shows the original Input bool showFftOutput=false;// Shows the output after the FFT but before the Reshuffle bool showReshuffleOutput=false;// Shows the output after the reshuffle bool showFinalResult=false; // Shows final result after cross-correlation bool showGemmInput=false; // Shows output after the reshuffle but before the matrix multiplication bool showReformatOutputAfterReshuffle=false; // Shows output after it has been reformatted after the reshuffling //openCL State cl_platform_id platform_id=NULL; cl_device_id device_id=NULL; cl_context context=NULL; cl_command_queue queue=NULL; cl_program program=NULL; cl_kernel kernel=NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret=0; // Stores the error values retuned by many functions cl_event event = NULL; cl_event events[10]; cl_kernel clKernel; //FFT state clAmdFftPlanHandle plHandle; clAmdFftResultLocation place = CLFFT_OUTOFPLACE; //Alternative CLFFT_INPLACE clAmdFftLayout inLayout = CLFFT_COMPLEX_INTERLEAVED; clAmdFftLayout outLayout = CLFFT_COMPLEX_INTERLEAVED; clAmdFftDim dim = CLFFT_1D; size_t clStrides[3]={0,0,0}; size_t clLengths[3]; clLengths[0]=(MEM_SIZE/2);//Length of first dimension of fft clLengths[1]=1;//length of second dimension of fft clLengths[2]=1; clStrides[ 0 ] = 1; clStrides[ 1 ] = clStrides[ 0 ] * clLengths[ 0 ]; clStrides[ 2 ] = clStrides[ 1 ] * clLengths[ 1 ]; clStrides[ 3 ] = clStrides[ 2 ] * clLengths[ 2 ]; size_t batchSize=CHANSIZE;//number of discreet fft's to be calculated simultaneously //Initialise openCL OPENCL_V_THROW(clGetPlatformIDs(1, &platform_id, &ret_num_platforms),"clGetPlatformIDs Failed"); OPENCL_V_THROW(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,&ret_num_devices),"clGetDeviceIDs Failed"); context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); OPENCL_V_THROW(ret, "Creating Context failed" ); queue = clCreateCommandQueue(context, device_id, 0, &ret); OPENCL_V_THROW(ret, "Creating command queue failed" ); //===========Initialise the host buffers====================================== /* * The functions sgenerate2darray(), screate2darray() and sgenerate2darrayout() are defined and declared in definition.h */ float** src_a_h=sgenerate2darray(NO_INPUTS,MEM_SIZE);//To be used to store the original input float** answer=screate2darray(NO_INPUTS,MEM_SIZE);//To be used to store the answer after the reshuffling float** corr_h=sgenerate2darrayout(NO_INPUTS,CHANSIZE << 1,CHANNELNO);// To be used to store the final answer if(showStartInput){ cout << "Initial Input Buffer" << "\n"; for(int j=0;j<NO_INPUTS;j++){ for(int i=0;i<MEM_SIZE;i++){ cout << src_a_h[j][i] << " "; }cout << "\n"; }printf("\n"); } //=================================================================== //Calculation of facs for reshuffling complex <float>* facs_h=(complex <float>*) malloc(sizeof(complex <float>)*(MEM_SIZE/2)); complex<float> I=1.0i; complex <float> xx=2.0*PI; for(int i=0;i<MEM_SIZE/2;i++){ facs_h[i]=(1.0*i)/(1.0*MEM_SIZE); facs_h[i]=exp(xx*(-I*facs_h[i])); } //=================================================================== //Initialise GPU memory buffers size_t sizeofgpumem=NO_INPUTS*MEM_SIZE*sizeof(float); size_t sizeoffacsmem=MEM_SIZE*sizeof(float); cl_mem clMemBuffersIn = clCreateBuffer(context,CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,sizeofgpumem,src_a_h[0],&ret); OPENCL_V_THROW( ret, "Creating clMemBuffersIn Buffer failed" ); cl_mem clMemBuffersOut = clCreateBuffer(context,CL_MEM_READ_WRITE,sizeofgpumem,NULL,&ret); OPENCL_V_THROW (ret, "Creating fft output Buffer failed"); cl_mem facs = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,sizeoffacsmem,facs_h,&ret); OPENCL_V_THROW (ret, "Creating facs Buffer failed"); //===========================Starting the fft=============================// clAmdFftSetupData setupData; OPENCL_V_THROW( clAmdFftInitSetupData( &setupData ),"clAmdFftInitSetupData failed" ); OPENCL_V_THROW( clAmdFftSetup( &setupData ), "clAmdFftSetup failed" ); OPENCL_V_THROW( clAmdFftCreateDefaultPlan( &plHandle, context, dim, clLengths ), "clAmdFftCreateDefaultPlan failed" ); OPENCL_V_THROW (clAmdFftSetPlanBatchSize (plHandle, batchSize),"Setting BatchSize Failed"); OPENCL_V_THROW (clAmdFftSetResultLocation( plHandle, place ), "clAmdFftSetResultLocation failed" ); OPENCL_V_THROW (clAmdFftSetPlanInStride ( plHandle, dim, clStrides ), "clAmdFftSetPlanInStride failed" ); OPENCL_V_THROW (clAmdFftSetPlanOutStride ( plHandle, dim, clStrides ), "clAmdFftSetPlanOutStride failed" ); OPENCL_V_THROW (clAmdFftSetPlanDistance ( plHandle, clStrides[ dim ], clStrides[ dim ]), "clAmdFftSetPlanDistance failed" ); OPENCL_V_THROW( clAmdFftBakePlan( plHandle, 1, &queue, NULL, NULL ), "clAmdFftBakePlan failed" ); size_t tempbuffersize=0; OPENCL_V_THROW( clAmdFftGetTmpBufSize(plHandle, &tempbuffersize ), "clAmdFftGetTmpBufSize failed" ); //allocate the intermediate buffer cl_mem clMedBuffer=NULL; if (tempbuffersize) { cl_int medstatus; clMedBuffer = clCreateBuffer ( context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,tempbuffersize, 0, &medstatus); OPENCL_V_THROW( medstatus, "Creating fft intermediate Buffer failed" ); } if (( place == CLFFT_INPLACE )&& ( inLayout != outLayout )) { switch( inLayout ) { case CLFFT_COMPLEX_INTERLEAVED: { assert (CLFFT_COMPLEX_PLANAR == outLayout); throw std::runtime_error( "Cannot use the same buffer for interleaved->planar in-place transforms" ); break; } case CLFFT_COMPLEX_PLANAR: { assert (CLFFT_COMPLEX_INTERLEAVED == outLayout); throw std::runtime_error( "Cannot use the same buffer for planar->interleaved in-place transforms" ); break; } } } cl_mem * BuffersOut = ( place == CLFFT_INPLACE ) ? NULL : &clMemBuffersOut; //========Timimg fft============// double time_fft_start=omp_get_wtime(); for(int i=0;i<ITER_FFT;i++){ OPENCL_V_THROW( clAmdFftEnqueueTransform( plHandle, CLFFT_FORWARD, 1,&queue,0,NULL,&event,&clMemBuffersIn,BuffersOut,clMedBuffer ),"clAmdFftEnqueueTransform failed" ); } ret=clWaitForEvents(1,&event); double time_fft_end=omp_get_wtime(); //Cleaning up fft OPENCL_V_THROW( clAmdFftDestroyPlan( &plHandle ), "clAmdFftDestroyPlan failed" ); OPENCL_V_THROW( clAmdFftTeardown( ), "clAmdFftTeardown failed" ); //displaying results if(showFftOutput){ OPENCL_V_THROW( clEnqueueReadBuffer( queue, clMemBuffersOut, CL_TRUE, 0, sizeofgpumem,answer [0], 0, NULL, NULL ),"Reading the result buffer failed" ); cout << "**FFT Output**" << endl; for(int j=0;j<NO_INPUTS;j++){ for(int i=0;i<MEM_SIZE;i++){ cout << answer[j][i] << " "; } printf("\n"); }printf("\n"); } //==================End of FFT=============================================// //==================Start the Reshuffling==================================// FILE *fp; char fileName[]="./reshuffle.cl"; char* source_str=NULL; size_t source_size; //Load the source code containing the kernel/ fp = fopen(fileName, "r"); if (!fp) { fprintf(stderr, "Failed to load reshuffle kernel.¥n"); exit(1); } source_str = (char*)malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); //Preparation for building the Kernel program = clCreateProgramWithSource(context, 1, (const char **)&source_str,(const size_t *)&source_size, &ret); OPENCL_V_THROW( ret, "Creating program with source failed for Reshuffle" ); OPENCL_V_THROW( clBuildProgram(program, 1, &device_id, NULL, NULL, NULL),"Build Program Failed for Reshuffle"); kernel = clCreateKernel(program, "reshuffle", &ret); OPENCL_V_THROW( ret, "Creating kernel failed for Reshuffle" ); //Set kernel parameters const int num=NO_INPUTS*MEM_SIZE; const int block=MEM_SIZE; OPENCL_V_THROW(clSetKernelArg(kernel, 0, sizeof(cl_mem), (float *)&clMemBuffersIn),"Passing argument 0 of reshuffle failed"); OPENCL_V_THROW(clSetKernelArg(kernel, 1, sizeof(cl_mem), (float *)&facs),"Passing arg 1 of reshuffle failed"); OPENCL_V_THROW(clSetKernelArg(kernel, 2, sizeof(cl_mem), (float *)&clMemBuffersOut),"Passing arg2 of reshuffle failed"); OPENCL_V_THROW(clSetKernelArg(kernel, 3, sizeof(int), (int *)&num),"Passing arg3 of reshuffle failed"); OPENCL_V_THROW(clSetKernelArg(kernel, 4, sizeof(int), (int *)&block),"Passing arg4 of reshuffle failed"); // Execute OpenCL Kernel // const size_t local_ws=NO_INPUTS*MEM_SIZE; const size_t global_ws=min(NO_THREAD_PER_BLOCK,MEM_SIZE);//ceil(MEM_SIZE/local_ws); //===========timing the reshuffle===============// double time_reshuffle_start=omp_get_wtime(); for(int i=0;i<ITER_FFT;i++){ OPENCL_V_THROW(clEnqueueNDRangeKernel(queue,kernel, 1, NULL,&local_ws,&global_ws, 0, NULL, NULL),"Reshuffle Kernel execution failed"); } double time_reshuffle_end=omp_get_wtime(); //Read back data OPENCL_V_THROW(clEnqueueReadBuffer(queue, clMemBuffersOut, CL_TRUE, 0, sizeofgpumem,answer[0], 0, NULL, NULL),"Reading back reshuffled data failed"); //====================Finish the reshuffling================================// if(showReshuffleOutput){ cout << "Output after reshuffling" << endl; for(int j=0;j<NO_INPUTS;j++){ for(int i=0;i<MEM_SIZE;i++){ cout << answer[j][i] << " "; } printf("\n"); }printf("\n"); } //=================Reformatting the input given to the matrix multiply===================================// float** answer_final=screate2darray(NO_INPUTS*2,MEM_SIZE/2); for(int i=0;i<NO_INPUTS;i++){ for(int j=0;j<MEM_SIZE;j++){ if(j&1) answer_final[(i<<1)+1][j >> 1]=answer[i][j]; else answer_final[(i<<1)][j >> 1]=answer[i][j]; } }
clAmdFftPlanHandle CreateFFTPlan(clLabviewDevice *d, int FFTType, int Dimension, int Width, int Height, int Depth, int StrideW, int StrideH, int StrideD, int StrideT, int PaddingW, int PaddingH, int PaddingD, size_t *OutputWidthFloat, int SingleOrDouble, int *Error){ clAmdFftPlanHandle plHandle = 0; #ifndef NO_OPENCL *Error = clLabviewDevice::Error(clLabviewDevice::SanitizeDevice(d)); if(*Error != 0) return NULL; clAmdFftResultLocation place = CLFFT_OUTOFPLACE; //INPLACE NOT SUPPORTED size_t clStridesIn[ 4 ]; size_t clStridesOut[ 4 ]; clAmdFftLayout inLayout; clAmdFftLayout outLayout; size_t batchSize; clAmdFftDim dim; size_t clLengths[ 3 ]; clAmdFftStatus status; int DistanceIn, DistanceOut; int OutputWidth; int FFTSize; if(FFTType == 1){ //Needed to the FFT does the proper size on the inverse FFTSize = (Width - 1)*2; }else{ FFTSize = Width; } switch(Dimension){ case 0: //1D FFT batchSize = Depth*Height; dim = CLFFT_1D; clLengths[0] = FFTSize; clLengths[1] = 1; clLengths[2] = 1; break; case 1: //2D FFT batchSize = Depth; dim = CLFFT_2D; clLengths[0] = FFTSize; clLengths[1] = Height; clLengths[2] = 1; break; case 2: //3D FFT batchSize = 1; dim = CLFFT_3D; clLengths[0] = FFTSize; clLengths[1] = Height; clLengths[2] = Depth; break; } switch(FFTType){ //These cases are defined by the labview Type Def case 0: inLayout = CLFFT_REAL; outLayout = CLFFT_HERMITIAN_PLANAR; //*BufferSize = clLengths[2]*clLengths[1]*(clLengths[0]/2 + 1)*batchSize*sizeof(std::complex<float>)/sizeof(float); OutputWidth = FFTSize/2 + 1; *OutputWidthFloat = OutputWidth; DistanceIn = Width*clLengths[1]*clLengths[2]; DistanceOut = OutputWidth*clLengths[1]*clLengths[2]; break; case 1: inLayout = CLFFT_HERMITIAN_PLANAR; outLayout = CLFFT_REAL; //*BufferSize = clLengths[2]*clLengths[1]*(clLengths[0] - 1)*2*batchSize; OutputWidth = FFTSize; *OutputWidthFloat = OutputWidth; DistanceIn = Width*clLengths[1]*clLengths[2]; DistanceOut = OutputWidth*clLengths[1]*clLengths[2]; break; case 2: inLayout = CLFFT_COMPLEX_PLANAR; outLayout = CLFFT_COMPLEX_PLANAR; //*BufferSize = clLengths[2]*clLengths[1]*(clLengths[0])*batchSize*sizeof(std::complex<float>)/sizeof(float); OutputWidth = FFTSize; *OutputWidthFloat = OutputWidth; DistanceIn = Width*clLengths[1]*clLengths[2]; DistanceOut = OutputWidth*clLengths[1]*clLengths[2]; place = CLFFT_INPLACE; break; } size_t fftVectorSize= 0, fftVectorSizePadded = 0, fftBatchSize = 0; clStridesIn[0] = 1; clStridesIn[ 1 ] = (clStridesIn[ 0 ])*(Width + PaddingW); clStridesIn[ 2 ] = (clStridesIn[ 1 ])*(clLengths[ 1 ] + PaddingH); clStridesIn[ 3 ] = (clStridesIn[ 2 ])*(clLengths[ 2 ] + PaddingD); clStridesOut[0] = 1; clStridesOut[ 1 ] = (clStridesOut[ 0 ])*(OutputWidth + PaddingW); clStridesOut[ 2 ] = (clStridesOut[ 1 ])*(clLengths[ 1 ] + PaddingH); clStridesOut[ 3 ] = (clStridesOut[ 2 ])*(clLengths[ 2 ] + PaddingD); fftVectorSize = clLengths[ 0 ] * clLengths[ 1 ] * clLengths[ 2 ]; fftVectorSizePadded = clStridesIn[ 3 ]; fftBatchSize = fftVectorSizePadded * batchSize; status = clAmdFftCreateDefaultPlan( &plHandle, d->GetContext(), dim, clLengths ); if(status != 0){ *Error = clLabviewDevice::Error(OPENCLV_FFT_DEFAULT_PLAN_FAILED); return NULL; } status = clAmdFftSetResultLocation( plHandle, place ); if(status != 0){ *Error = clLabviewDevice::Error(OPENCLV_FFT_SET_RESULT_FAILED); return NULL; } status = clAmdFftSetLayout( plHandle, inLayout, outLayout ); if(status != 0){ *Error = clLabviewDevice::Error(OPENCLV_FFT_SET_LAYOUT_FAILED); return NULL; } if(SingleOrDouble == 0){ status = clAmdFftSetPlanPrecision(plHandle, CLFFT_SINGLE); }else{ status = clAmdFftSetPlanPrecision(plHandle, CLFFT_DOUBLE); } if(status != 0){ *Error = clLabviewDevice::Error(OPENCLV_FFT_SET_PRECISION_FAILED); return NULL; } status = clAmdFftSetPlanBatchSize( plHandle, batchSize ); if(status != 0){ *Error = clLabviewDevice::Error(OPENCLV_FFT_SET_BATCHSIZE_FAILED); return NULL; } status = clAmdFftSetPlanInStride ( plHandle, dim, clStridesIn ); if(status != 0){ *Error = clLabviewDevice::Error(OPENCLV_FFT_SET_STRIDES_IN_FAILED); return NULL; } status = clAmdFftSetPlanOutStride ( plHandle, dim, clStridesOut ); if(status != 0){ *Error = clLabviewDevice::Error(OPENCLV_FFT_SET_STRIDES_OUT_FAILED); return NULL; } status = clAmdFftSetPlanDistance ( plHandle, DistanceIn, DistanceOut); if(status != 0){ *Error = clLabviewDevice::Error(OPENCLV_FFT_SET_PLAN_DIST_FAILED); return NULL; } status = clAmdFftBakePlan( plHandle, 0, d->GetQueuePtr(), NULL, NULL ); if(status != 0){ *Error = clLabviewDevice::Error(OPENCLV_FFT_BAKE_PLAN_FAILED); return NULL; } #endif return plHandle; }