void gpuNUFFT::GpuNUFFTOperator::freeDeviceMemory(int n_coils)
{
  if (!gpuMemAllocated)
    return;

  // destroy the cuFFT plan
  cufftDestroy(fft_plan);
  if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
    printf("error at thread synchronization 9: %s\n",cudaGetErrorString(cudaGetLastError()));
  freeLookupTable();
  
  freeTotalDeviceMemory(data_indices_d,data_sorted_d,crds_d,gdata_d,sectors_d,sector_centers_d,NULL);//trailing NULL terminates the pointer list (see the sketch below)
  
  if (n_coils > 1 && deapo_d != NULL)
    cudaFree(deapo_d);
  
  if (this->applySensData())
    cudaFree(sens_d);
  
  if (this->applyDensComp())
    cudaFree(density_comp_d);

  showMemoryInfo();
  gpuMemAllocated = false;
}
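// ----------------------------------------------------------------------------
// Illustrative only: a minimal sketch of a NULL-terminated variadic
// device-free helper, showing the calling convention used for
// freeTotalDeviceMemory above (the trailing NULL marks the end of the pointer
// list). This is NOT the library's implementation; the helper name is
// hypothetical and it assumes <cstdarg> and the CUDA runtime header are
// already included.
static void freeDevicePointersSketch(void *first, ...)
{
  va_list args;
  va_start(args, first);
  // walk the argument list until the trailing NULL sentinel is reached
  for (void *ptr = first; ptr != NULL; ptr = va_arg(args, void *))
    cudaFree(ptr);
  va_end(args);
}
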
// ----------------------------------------------------------------------------
// performForwardGpuNUFFT: NUFFT
//
// Forward gpuNUFFT implementation (inverse gridding) - interpolation from 
//                                   uniform grid data onto nonuniform k-space 
//                                   data based on optimized gpuNUFFT kernel 
//                                   with minimal oversampling ratio 
//                                   (see Beatty et al.)
//
// Basic steps: - apodization correction
//              - zero padding with osf
//              - FFT
//              - convolution and resampling
//
// parameters:
//  * data         : output kspace data 
//  * data_count   : number of samples on trajectory
//  * n_coils      : number of channels or coils
//  * crds         : coordinates on trajectory, passed as SoA
//  * imdata       : input image data
//  * imdata_count : number of image data points
//  * grid_width   : size of grid 
//  * kernel       : precomputed convolution kernel as lookup table
//  * kernel_count : number of kernel lookup table entries
//  * sectors      : mapping of data indices according to each sector
//  * sector_count : number of sectors
//  * sector_centers: coordinates (x,y,z) of sector centers
//  * sector_width : width of sector 
//  * im_width     : dimension of image
//  * osr          : oversampling ratio
//  * gpuNUFFT_out : enum indicating how far gpuNUFFT has to be processed
//  
void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(gpuNUFFT::Array<DType2> imgData,gpuNUFFT::Array<CufftType>& kspaceData, GpuNUFFTOutput gpuNUFFTOut)
{
  if (DEBUG)
  {
    std::cout << "performing forward gpuNUFFT!!!" << std::endl;
    std::cout << "dataCount: " << kspaceData.count() << " chnCount: " << kspaceData.dim.channels << std::endl;
    std::cout << "imgCount: " << imgData.count() << " gridWidth: " << this->getGridWidth() << std::endl;
  }
  showMemoryInfo();
  
  if (debugTiming)
    startTiming();

  int     data_count   = (int)this->kSpaceTraj.count();
  int     n_coils      = (int)kspaceData.dim.channels;
  IndType imdata_count = this->imgDims.count();
  int     sector_count = (int)this->gridSectorDims.count();

  //cuda mem allocation
  DType2 *imdata_d;
  CufftType *data_d;

  if (DEBUG)
    printf("allocate and copy imdata of size %d...\n",imdata_count);
  allocateDeviceMem<DType2>(&imdata_d,imdata_count);

  if (DEBUG)
    printf("allocate and copy data of size %d...\n",data_count);
  allocateDeviceMem<CufftType>(&data_d,data_count);

  initDeviceMemory(n_coils);
    
  if (debugTiming)
    printf("Memory allocation: %.2f ms\n",stopTiming());

  int err;

  //iterate over coils and compute result
  for (int coil_it = 0; coil_it < n_coils; coil_it++)
  {
    int data_coil_offset = coil_it * data_count;
    int im_coil_offset = coil_it * (int)imdata_count;

    if (this->applySensData())
      // with sensitivity data present, the single input image is
      // implicitly repeated for every coil
      copyToDevice(imgData.data,imdata_d,imdata_count);
    else
      copyToDevice(imgData.data + im_coil_offset,imdata_d,imdata_count);

    //reset temp arrays
    cudaMemset(gdata_d,0, sizeof(CufftType)*gi_host->grid_width_dim);
    cudaMemset(data_d,0, sizeof(CufftType)*data_count);

    if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
      printf("error at thread synchronization 1: %s\n",cudaGetErrorString(cudaGetLastError()));
    
    if (this->applySensData())
    {
      copyToDevice(this->sens.data + im_coil_offset, sens_d,imdata_count);
      performSensMul(imdata_d,sens_d,gi_host,false);
    }

    // apodization correction
    if (n_coils > 1 && deapo_d != NULL)
      performForwardDeapodization(imdata_d,deapo_d,gi_host);
    else
      performForwardDeapodization(imdata_d,gi_host);

    if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
      printf("error at thread synchronization 2: %s\n",cudaGetErrorString(cudaGetLastError()));
    // resize by oversampling factor and zero pad
    performPadding(imdata_d,gdata_d,gi_host);

    if (debugTiming)
      startTiming();

    if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
      printf("error at thread synchronization 3: %s\n",cudaGetErrorString(cudaGetLastError()));
    // shift image to get correct zero frequency position
    performFFTShift(gdata_d,INVERSE,getGridDims(),gi_host);

    if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
      printf("error at thread synchronization 4: %s\n",cudaGetErrorString(cudaGetLastError()));
    // note: imdata_d could be freed at this point
    // forward FFT into k-space domain
    if ((err = pt2CufftExec(fft_plan, gdata_d, gdata_d, CUFFT_FORWARD)) != CUFFT_SUCCESS)
    {
      fprintf(stderr,"cufft has failed with err %i \n",err);
      showMemoryInfo(true,stderr);
    }

    if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
      printf("error at thread synchronization 5: %s\n",cudaGetErrorString(cudaGetLastError()));
    performFFTShift(gdata_d,FORWARD,getGridDims(),gi_host);

    if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
      printf("error at thread synchronization 6: %s\n",cudaGetErrorString(cudaGetLastError()));

    if (debugTiming)
      printf("FFT (incl. shift): %.2f ms\n",stopTiming());

    if (debugTiming)
      startTiming();

    // convolution and resampling to non-standard trajectory
    forwardConvolution(data_d,crds_d,gdata_d,NULL,sectors_d,sector_centers_d,gi_host);
    if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
      printf("error at thread synchronization 7: %s\n",cudaGetErrorString(cudaGetLastError()));
    
    if (debugTiming)
      printf("Forward Convolution: %.2f ms\n",stopTiming());

    performFFTScaling(data_d,gi_host->data_count,gi_host);
    if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
      printf("error: at thread synchronization 8: %s\n",cudaGetErrorString(cudaGetLastError()));
    
    //write result in correct order back into output array
    writeOrderedGPU(data_sorted_d,data_indices_d,data_d,(int)this->kSpaceTraj.count());
    copyFromDevice(data_sorted_d,kspaceData.data + data_coil_offset,data_count);
  }//iterate over coils
  
  freeTotalDeviceMemory(data_d,imdata_d,NULL);
  freeDeviceMemory(n_coils);

  if ((cudaThreadSynchronize() != cudaSuccess))
    fprintf(stderr,"error in performForwardGpuNUFFT function: %s\n",cudaGetErrorString(cudaGetLastError()));
  free(gi_host);
}
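// ----------------------------------------------------------------------------
// Illustrative only: a sketch of the host-side call sequence for the forward
// transform described in the comment block above. The factory call and array
// layout follow the conventions used in the TestForwardBackward test at the
// end of this file; the helper name and its parameters are hypothetical, not
// part of the operator interface.
static void exampleForwardGpuNUFFT(gpuNUFFT::Array<DType>     &kSpaceTraj,
                                   gpuNUFFT::Array<DType2>    &imgData,
                                   gpuNUFFT::Array<CufftType> &kspaceData,
                                   gpuNUFFT::Dimensions        imgDims)
{
  int   kernel_width = 3;     // convolution kernel width
  float osf          = 1.25f; // oversampling ratio (see Beatty et al.)
  int   sector_width = 8;     // sector width used for the parallelization

  // precomputation (data sorting, sector mapping, kernel lookup table)
  gpuNUFFT::GpuNUFFTOperatorFactory factory;
  gpuNUFFT::GpuNUFFTOperator *op = factory.createGpuNUFFTOperator(
      kSpaceTraj, kernel_width, sector_width, osf, imgDims);

  // image -> non-uniform k-space samples (apodization, zero padding, FFT,
  // convolution/resampling); assumes the trailing GpuNUFFTOutput parameter
  // has a default for the full pipeline, as the two-argument calls in the
  // test below suggest
  op->performForwardGpuNUFFT(imgData, kspaceData);

  delete op;
}
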
// ----------------------------------------------------------------------------
// performGpuNUFFTAdj: NUFFT^H
//
// Adjoint gpuNUFFT implementation - interpolation from nonuniform k-space 
//                                   data onto the oversampled grid based on 
//                                   the optimized gpuNUFFT kernel with minimal 
//                                   oversampling ratio (see Beatty et al.)
//
// Basic steps: - density compensation
//              - convolution with interpolation function
//              - iFFT
//              - cropping due to oversampling ratio
//              - apodization correction
//
// parameters:
//  * data         : input kspace data 
//  * data_count   : number of samples on trajectory
//  * n_coils      : number of channels or coils
//  * crds         : coordinate array on trajectory
//  * imdata       : output image data
//  * imdata_count : number of image data points
//  * grid_width   : size of grid 
//  * kernel       : precomputed convolution kernel as lookup table
//  * kernel_count : number of kernel lookup table entries
//  * sectors      : mapping of start and end points of each sector
//  * sector_count : number of sectors
//  * sector_centers: coordinates (x,y,z) of sector centers
//  * sector_width : width of sector 
//  * im_width     : dimension of image
//  * osr          : oversampling ratio
//  * do_comp      : true, if density compensation has to be done
//  * density_comp : density compensation array
//  * gpuNUFFT_out : enum indicating how far gpuNUFFT has to be processed
//  
void gpuNUFFT::GpuNUFFTOperator::performGpuNUFFTAdj(gpuNUFFT::Array<DType2> kspaceData, gpuNUFFT::Array<CufftType>& imgData, GpuNUFFTOutput gpuNUFFTOut)
{
  if (DEBUG)
  {
    std::cout << "performing gpuNUFFT adjoint!!!" << std::endl;
    std::cout << "dataCount: " << kspaceData.count() << " chnCount: " << kspaceData.dim.channels << std::endl;
    std::cout << "imgCount: " << imgData.count() << " gridWidth: " << this->getGridWidth() << std::endl;
    std::cout << "apply density comp: " << this->applyDensComp() << std::endl;
    std::cout << "apply sens data: " << this->applySensData() << std::endl;
  }
  if (debugTiming)
    startTiming();

  showMemoryInfo();

  int     data_count   = (int)this->kSpaceTraj.count();
  int     n_coils      = (int)kspaceData.dim.channels;
  IndType imdata_count = this->imgDims.count();
  int     sector_count = (int)this->gridSectorDims.count();

  // select data ordered and leave it on gpu
  DType2* data_d;

  if (DEBUG)
    printf("allocate data of size %d...\n",data_count);
  allocateDeviceMem<DType2>(&data_d,data_count);

  CufftType *imdata_d, *imdata_sum_d = NULL;

  if (DEBUG)
    printf("allocate and copy imdata of size %d...\n",imdata_count);
  allocateDeviceMem<CufftType>(&imdata_d,imdata_count);

  if (this->applySensData())
  {
    if (DEBUG)
      printf("allocate and copy temp imdata of size %d...\n",imdata_count);
    allocateDeviceMem<CufftType>(&imdata_sum_d,imdata_count);
    cudaMemset(imdata_sum_d,0,imdata_count*sizeof(CufftType));
  }

  initDeviceMemory(n_coils);
  int err;

  if (debugTiming)
    printf("Memory allocation: %.2f ms\n",stopTiming());

  //iterate over coils and compute result
  for (int coil_it = 0; coil_it < n_coils; coil_it++)
  {
    int data_coil_offset = coil_it * data_count;
    int im_coil_offset = coil_it * (int)imdata_count;//gi_host->width_dim;

    cudaMemset(gdata_d,0, sizeof(CufftType)*gi_host->grid_width_dim);
    //copy coil data to device and select ordered
    copyToDevice(kspaceData.data + data_coil_offset,data_d,data_count);
    selectOrderedGPU(data_d,data_indices_d,data_sorted_d,data_count);

    if (this->applyDensComp())
      performDensityCompensation(data_sorted_d,density_comp_d,gi_host);

    if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
      printf("error at adj thread synchronization 1: %s\n",cudaGetErrorString(cudaGetLastError()));
    
    if (debugTiming)
      startTiming();

    adjConvolution(data_sorted_d,crds_d,gdata_d,NULL,sectors_d,sector_centers_d,gi_host);

    if (debugTiming)
      printf("Adjoint convolution: %.2f ms\n",stopTiming());

    if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
      fprintf(stderr,"error at adj  thread synchronization 2: %s\n",cudaGetErrorString(cudaGetLastError()));
    if (gpuNUFFTOut == CONVOLUTION)
    {
      if (DEBUG)
        printf("stopping output after CONVOLUTION step\n");
      //get output
      copyFromDevice<CufftType>(gdata_d,imgData.data,gi_host->grid_width_dim);
      if (DEBUG)
        printf("test value at point zero: %f\n",(imgData.data)[0].x);

      free(gi_host);
      freeTotalDeviceMemory(data_d,imdata_d,imdata_sum_d,NULL);
      freeDeviceMemory(n_coils);
      return;
    }
    if ((cudaThreadSynchronize() != cudaSuccess))
      fprintf(stderr,"error at adj thread synchronization 3: %s\n",cudaGetErrorString(cudaGetLastError()));
    
    if (debugTiming)
      startTiming();

    performFFTShift(gdata_d,INVERSE,getGridDims(),gi_host);

    //Inverse FFT
    if ((err = pt2CufftExec(fft_plan, gdata_d, gdata_d, CUFFT_INVERSE)) != CUFFT_SUCCESS)
    {
      fprintf(stderr,"cufft has failed at adj with err %i \n",err);
      showMemoryInfo(true,stderr);
    }
    if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
      fprintf(stderr,"error at adj thread synchronization 4: %s\n",cudaGetErrorString(cudaGetLastError()));

    if (gpuNUFFTOut == FFT)
    {
      if (DEBUG)
        printf("stopping output after FFT step\n");
      //get output
      copyFromDevice<CufftType>(gdata_d,imgData.data,gi_host->grid_width_dim);

      free(gi_host);
      
      freeTotalDeviceMemory(data_d,imdata_d,imdata_sum_d,NULL);
      freeDeviceMemory(n_coils);
      
      printf("last cuda error: %s\n", cudaGetErrorString(cudaGetLastError()));
      return;
    }
    if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
      printf("error at adj thread synchronization 5: %s\n",cudaGetErrorString(cudaGetLastError()));
    performFFTShift(gdata_d,INVERSE,getGridDims(),gi_host);
    
    if (debugTiming)
      printf("iFFT (incl. shift) : %.2f ms\n",stopTiming());

    if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
      printf("error at adj thread synchronization 6: %s\n",cudaGetErrorString(cudaGetLastError()));
    performCrop(gdata_d,imdata_d,gi_host);

    if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
      printf("error at adj thread synchronization 7: %s\n",cudaGetErrorString(cudaGetLastError()));
    //check if precomputed deapo function can be used
    if (n_coils > 1 && deapo_d != NULL)
      performDeapodization(imdata_d,deapo_d,gi_host);
    else
      performDeapodization(imdata_d,gi_host);
    if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
      printf("error at adj thread synchronization 8: %s\n",cudaGetErrorString(cudaGetLastError()));

    performFFTScaling(imdata_d,gi_host->im_width_dim,gi_host);
    if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
      printf("error: at adj  thread synchronization 9: %s\n",cudaGetErrorString(cudaGetLastError()));

    if (this->applySensData())
    {
      copyToDevice(this->sens.data + im_coil_offset, sens_d,imdata_count);
      performSensMul(imdata_d,sens_d,gi_host,true);
      performSensSum(imdata_d,imdata_sum_d,gi_host);
    }
    else
    {
      // get result per coil; no summation is performed
      // in the absence of sensitivity data
      copyFromDevice<CufftType>(imdata_d,imgData.data+im_coil_offset,imdata_count);
    }

    if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
      printf("error: at adj  thread synchronization 10: %s\n",cudaGetErrorString(cudaGetLastError()));
  }//iterate over coils
  
  if (this->applySensData())
  {
    // get result of combined coils
    copyFromDevice<CufftType>(imdata_sum_d,imgData.data,imdata_count);
  }

  if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
    printf("error: at adj  thread synchronization 11: %s\n",cudaGetErrorString(cudaGetLastError()));
  
  freeTotalDeviceMemory(data_d,imdata_d,imdata_sum_d,NULL);
  freeDeviceMemory(n_coils);
  if ((cudaThreadSynchronize() != cudaSuccess))
    fprintf(stderr,"error in gpuNUFFT_gpu_adj function: %s\n",cudaGetErrorString(cudaGetLastError()));
  free(gi_host);
}
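// ----------------------------------------------------------------------------
// Illustrative only: a sketch of requesting an intermediate result from the
// adjoint transform via the GpuNUFFTOutput flag handled above. Stopping after
// CONVOLUTION returns the oversampled grid right after gridding, i.e. before
// the iFFT, cropping and apodization correction. The helper name is
// hypothetical; it assumes gridData was allocated with the oversampled grid
// size (gi_host->grid_width_dim elements, the amount copied back in the
// CONVOLUTION branch above) and that the GpuNUFFTOutput values live in the
// gpuNUFFT namespace, as the unqualified uses above suggest.
static void exampleAdjointConvolutionOnly(gpuNUFFT::GpuNUFFTOperator *op,
                                          gpuNUFFT::Array<DType2>    &kspaceData,
                                          gpuNUFFT::Array<CufftType> &gridData)
{
  // k-space samples -> oversampled grid only (no iFFT / crop / deapodization)
  op->performGpuNUFFTAdj(kspaceData, gridData, gpuNUFFT::CONVOLUTION);
}
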
TEST(TestForwardBackward,Test_GpuArray)
{
  // Same test as above, but using the GpuArray data structure

  int kernel_width = 3;
  float osf = 1.25; //oversampling ratio
  int sector_width = 8;

  //Data
  int data_entries = 2;
  DType2* data = (DType2*) calloc(data_entries,sizeof(DType2)); //2 complex samples (re,im)
  data[0].x = 5; //Re
  data[0].y = 0; //Im
  data[1].x = 1; //Re
  data[1].y = 0; //Im

  //Coords
  //Scaled between -0.5 and 0.5
  //in triplets (x,y,z) as structure of arrays
  //p0 = (0,0,0)
  //p1 = (0.25,0.25,0.25)
  DType* coords = (DType*) calloc(3*data_entries,sizeof(DType)); //3* x,y,z
  coords[0] = 0.00; //x0
  coords[1] = 0.25; //x1

  coords[2] = 0.00; //y0
  coords[3] = 0.25; //y1

  coords[4] = 0.00; //z0
  coords[5] = 0.25; //z1

  //Input data array, complex values,
  //copied to the GPU
  gpuNUFFT::GpuArray<DType2> dataArray_gpu;
  dataArray_gpu.dim.length = data_entries;

  allocateAndCopyToDeviceMem<DType2>(&dataArray_gpu.data,data,data_entries);

  //Input array containing the trajectory in k-space
  gpuNUFFT::Array<DType> kSpaceData;
  kSpaceData.data = coords;
  kSpaceData.dim.length = data_entries;

  gpuNUFFT::Dimensions imgDims;
  imgDims.width = 64;
  imgDims.height = 64;
  imgDims.depth = 64;

  //precomputation performed by the factory
  gpuNUFFT::GpuNUFFTOperatorFactory factory; 
  gpuNUFFT::GpuNUFFTOperator *gpuNUFFTOp = factory.createGpuNUFFTOperator(kSpaceData,kernel_width,sector_width,osf,imgDims);

  //Output Array
  gpuNUFFT::GpuArray<CufftType> imgArray_gpu;
  imgArray_gpu.dim = imgDims;
  allocateDeviceMem<CufftType>(&imgArray_gpu.data,imgArray_gpu.count());

  //Perform FT^H Operation
  gpuNUFFTOp->performGpuNUFFTAdj(dataArray_gpu, imgArray_gpu);

  //Perform FT Operation
  gpuNUFFTOp->performForwardGpuNUFFT(imgArray_gpu,dataArray_gpu);
  copyFromDevice(dataArray_gpu.data,data,data_entries);

  printf("contrast %f \n",data[0].x/data[1].x);
  EXPECT_NEAR(data[0].x/data[1].x,5.0,epsilon);

  free(data);
  free(coords);
  freeTotalDeviceMemory(dataArray_gpu.data,imgArray_gpu.data,NULL);

  delete gpuNUFFTOp;
}
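
// ----------------------------------------------------------------------------
// For reference only: the comment blocks above cite Beatty et al. for the
// kernel design with minimal oversampling ratio. The sketch below shows the
// commonly quoted Kaiser-Bessel shape parameter for a given kernel width and
// oversampling ratio (such as kernel_width = 3 and osf = 1.25 in the test
// above). The function name is hypothetical, the library's own kernel
// precomputation (lookup table) lives elsewhere, and <cmath> with M_PI is
// assumed to be available.
static double kaiserBesselBetaSketch(double kernel_width, double osr)
{
  // beta = pi * sqrt( (W/osr)^2 * (osr - 0.5)^2 - 0.8 ), cf. Beatty et al.
  return M_PI * std::sqrt(std::pow(kernel_width / osr, 2.0) *
                          std::pow(osr - 0.5, 2.0) - 0.8);
}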