void gpuNUFFT::GpuNUFFTOperator::freeDeviceMemory(int n_coils) { if (!gpuMemAllocated) return; cufftDestroy(fft_plan); // Destroy the cuFFT plan. if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 9: %s\n",cudaGetErrorString(cudaGetLastError())); freeLookupTable(); freeTotalDeviceMemory(data_indices_d,data_sorted_d,crds_d,gdata_d,sectors_d,sector_centers_d,NULL);//NULL as stop if (n_coils > 1 && deapo_d != NULL) cudaFree(deapo_d); if (this->applySensData()) cudaFree(sens_d); if (this->applyDensComp()) cudaFree(density_comp_d); showMemoryInfo(); gpuMemAllocated = false; }
// ---------------------------------------------------------------------------- // gpuNUFFT_gpu: NUFFT // // Inverse gpuNUFFT implementation - interpolation from uniform grid data onto // nonuniform k-space data based on optimized // gpuNUFFT kernel with minimal oversampling // ratio (see Beatty et al.) // // Basic steps: - apodization correction // - zero padding with osf // - FFT // - convolution and resampling // // parameters: // * data : output kspace data // * data_count : number of samples on trajectory // * n_coils : number of channels or coils // * crds : coordinates on trajectory, passed as SoA // * imdata : input image data // * imdata_count : number of image data points // * grid_width : size of grid // * kernel : precomputed convolution kernel as lookup table // * kernel_count : number of kernel lookup table entries // * sectors : mapping of data indices according to each sector // * sector_count : number of sectors // * sector_centers: coordinates (x,y,z) of sector centers // * sector_width : width of sector // * im_width : dimension of image // * osr : oversampling ratio // * gpuNUFFT_out : enum indicating how far gpuNUFFT has to be processed // void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(gpuNUFFT::Array<DType2> imgData,gpuNUFFT::Array<CufftType>& kspaceData, GpuNUFFTOutput gpuNUFFTOut) { if (DEBUG) { std::cout << "performing forward gpuNUFFT!!!" << std::endl; std::cout << "dataCount: " << kspaceData.count() << " chnCount: " << kspaceData.dim.channels << std::endl; std::cout << "imgCount: " << imgData.count() << " gridWidth: " << this->getGridWidth() << std::endl; } showMemoryInfo(); if (debugTiming) startTiming(); int data_count = (int)this->kSpaceTraj.count(); int n_coils = (int)kspaceData.dim.channels; IndType imdata_count = this->imgDims.count(); int sector_count = (int)this->gridSectorDims.count(); //cuda mem allocation DType2 *imdata_d; CufftType *data_d; if (DEBUG) printf("allocate and copy imdata of size %d...\n",imdata_count); allocateDeviceMem<DType2>(&imdata_d,imdata_count); if (DEBUG) printf("allocate and copy data of size %d...\n",data_count); allocateDeviceMem<CufftType>(&data_d,data_count); initDeviceMemory(n_coils); if (debugTiming) printf("Memory allocation: %.2f ms\n",stopTiming()); int err; //iterate over coils and compute result for (int coil_it = 0; coil_it < n_coils; coil_it++) { int data_coil_offset = coil_it * data_count; int im_coil_offset = coil_it * (int)imdata_count; if (this->applySensData()) // perform automatically "repeating" of input image in case // of existing sensitivity data copyToDevice(imgData.data,imdata_d,imdata_count); else copyToDevice(imgData.data + im_coil_offset,imdata_d,imdata_count); //reset temp arrays cudaMemset(gdata_d,0, sizeof(CufftType)*gi_host->grid_width_dim); cudaMemset(data_d,0, sizeof(CufftType)*data_count); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 1: %s\n",cudaGetErrorString(cudaGetLastError())); if (this->applySensData()) { copyToDevice(this->sens.data + im_coil_offset, sens_d,imdata_count); performSensMul(imdata_d,sens_d,gi_host,false); } // apodization Correction if (n_coils > 1 && deapo_d != NULL) performForwardDeapodization(imdata_d,deapo_d,gi_host); else performForwardDeapodization(imdata_d,gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 2: %s\n",cudaGetErrorString(cudaGetLastError())); // resize by oversampling factor and zero pad performPadding(imdata_d,gdata_d,gi_host); if (debugTiming) startTiming(); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 3: %s\n",cudaGetErrorString(cudaGetLastError())); // shift image to get correct zero frequency position performFFTShift(gdata_d,INVERSE,getGridDims(),gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 4: %s\n",cudaGetErrorString(cudaGetLastError())); // eventually free imdata_d // Forward FFT to kspace domain if (err=pt2CufftExec(fft_plan, gdata_d, gdata_d, CUFFT_FORWARD) != CUFFT_SUCCESS) { fprintf(stderr,"cufft has failed with err %i \n",err); showMemoryInfo(true,stderr); } if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 5: %s\n",cudaGetErrorString(cudaGetLastError())); performFFTShift(gdata_d,FORWARD,getGridDims(),gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 6: %s\n",cudaGetErrorString(cudaGetLastError())); if (debugTiming) printf("FFT (incl. shift): %.2f ms\n",stopTiming()); if (debugTiming) startTiming(); // convolution and resampling to non-standard trajectory forwardConvolution(data_d,crds_d,gdata_d,NULL,sectors_d,sector_centers_d,gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 7: %s\n",cudaGetErrorString(cudaGetLastError())); if (debugTiming) printf("Forward Convolution: %.2f ms\n",stopTiming()); performFFTScaling(data_d,gi_host->data_count,gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error: at thread synchronization 8: %s\n",cudaGetErrorString(cudaGetLastError())); //write result in correct order back into output array writeOrderedGPU(data_sorted_d,data_indices_d,data_d,(int)this->kSpaceTraj.count()); copyFromDevice(data_sorted_d,kspaceData.data + data_coil_offset,data_count); }//iterate over coils freeTotalDeviceMemory(data_d,imdata_d,NULL); freeDeviceMemory(n_coils); if ((cudaThreadSynchronize() != cudaSuccess)) fprintf(stderr,"error in performForwardGpuNUFFT function: %s\n",cudaGetErrorString(cudaGetLastError())); free(gi_host); }
// ---------------------------------------------------------------------------- // performGpuNUFFTAdj: NUFFT^H // // GpuNUFFT implementation - interpolation from nonuniform k-space data onto // oversampled grid based on optimized gpuNUFFT kernel // with minimal oversampling ratio (see Beatty et al.) // // Basic steps: - density compensation // - convolution with interpolation function // - iFFT // - cropping due to oversampling ratio // - apodization correction // // parameters: // * data : input kspace data // * data_count : number of samples on trajectory // * n_coils : number of channels or coils // * crds : coordinate array on trajectory // * imdata : output image data // * imdata_count : number of image data points // * grid_width : size of grid // * kernel : precomputed convolution kernel as lookup table // * kernel_count : number of kernel lookup table entries // * sectors : mapping of start and end points of each sector // * sector_count : number of sectors // * sector_centers: coordinates (x,y,z) of sector centers // * sector_width : width of sector // * im_width : dimension of image // * osr : oversampling ratio // * do_comp : true, if density compensation has to be done // * density_comp : densiy compensation array // * gpuNUFFT_out : enum indicating how far gpuNUFFT has to be processed // void gpuNUFFT::GpuNUFFTOperator::performGpuNUFFTAdj(gpuNUFFT::Array<DType2> kspaceData, gpuNUFFT::Array<CufftType>& imgData, GpuNUFFTOutput gpuNUFFTOut) { if (DEBUG) { std::cout << "performing gpuNUFFT adjoint!!!" << std::endl; std::cout << "dataCount: " << kspaceData.count() << " chnCount: " << kspaceData.dim.channels << std::endl; std::cout << "imgCount: " << imgData.count() << " gridWidth: " << this->getGridWidth() << std::endl; std::cout << "apply density comp: " << this->applyDensComp() << std::endl; std::cout << "apply sens data: " << this->applySensData() << std::endl; } if (debugTiming) startTiming(); showMemoryInfo(); int data_count = (int)this->kSpaceTraj.count(); int n_coils = (int)kspaceData.dim.channels; IndType imdata_count = this->imgDims.count(); int sector_count = (int)this->gridSectorDims.count(); // select data ordered and leave it on gpu DType2* data_d; if (DEBUG) printf("allocate data of size %d...\n",data_count); allocateDeviceMem<DType2>(&data_d,data_count); CufftType *imdata_d, *imdata_sum_d = NULL; if (DEBUG) printf("allocate and copy imdata of size %d...\n",imdata_count); allocateDeviceMem<CufftType>(&imdata_d,imdata_count); if (this->applySensData()) { if (DEBUG) printf("allocate and copy temp imdata of size %d...\n",imdata_count); allocateDeviceMem<CufftType>(&imdata_sum_d,imdata_count); cudaMemset(imdata_sum_d,0,imdata_count*sizeof(CufftType)); } initDeviceMemory(n_coils); int err; if (debugTiming) printf("Memory allocation: %.2f ms\n",stopTiming()); //iterate over coils and compute result for (int coil_it = 0; coil_it < n_coils; coil_it++) { int data_coil_offset = coil_it * data_count; int im_coil_offset = coil_it * (int)imdata_count;//gi_host->width_dim; cudaMemset(gdata_d,0, sizeof(CufftType)*gi_host->grid_width_dim); //copy coil data to device and select ordered copyToDevice(kspaceData.data + data_coil_offset,data_d,data_count); selectOrderedGPU(data_d,data_indices_d,data_sorted_d,data_count); if (this->applyDensComp()) performDensityCompensation(data_sorted_d,density_comp_d,gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at adj thread synchronization 1: %s\n",cudaGetErrorString(cudaGetLastError())); if (debugTiming) startTiming(); adjConvolution(data_sorted_d,crds_d,gdata_d,NULL,sectors_d,sector_centers_d,gi_host); if (debugTiming) printf("Adjoint convolution: %.2f ms\n",stopTiming()); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) fprintf(stderr,"error at adj thread synchronization 2: %s\n",cudaGetErrorString(cudaGetLastError())); if (gpuNUFFTOut == CONVOLUTION) { if (DEBUG) printf("stopping output after CONVOLUTION step\n"); //get output copyFromDevice<CufftType>(gdata_d,imgData.data,gi_host->grid_width_dim); if (DEBUG) printf("test value at point zero: %f\n",(imgData.data)[0].x); free(gi_host); freeTotalDeviceMemory(data_d,imdata_d,imdata_sum_d,NULL); freeDeviceMemory(n_coils); return; } if ((cudaThreadSynchronize() != cudaSuccess)) fprintf(stderr,"error at adj thread synchronization 3: %s\n",cudaGetErrorString(cudaGetLastError())); if (debugTiming) startTiming(); performFFTShift(gdata_d,INVERSE,getGridDims(),gi_host); //Inverse FFT if (err=pt2CufftExec(fft_plan, gdata_d, gdata_d, CUFFT_INVERSE) != CUFFT_SUCCESS) { fprintf(stderr,"cufft has failed at adj with err %i \n",err); showMemoryInfo(true,stderr); } if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) fprintf(stderr,"error at adj thread synchronization 4: %s\n",cudaGetErrorString(cudaGetLastError())); if (gpuNUFFTOut == FFT) { if (DEBUG) printf("stopping output after FFT step\n"); //get output copyFromDevice<CufftType>(gdata_d,imgData.data,gi_host->grid_width_dim); free(gi_host); freeTotalDeviceMemory(data_d,imdata_d,imdata_sum_d,NULL); freeDeviceMemory(n_coils); printf("last cuda error: %s\n", cudaGetErrorString(cudaGetLastError())); return; } if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at adj thread synchronization 5: %s\n",cudaGetErrorString(cudaGetLastError())); performFFTShift(gdata_d,INVERSE,getGridDims(),gi_host); if (debugTiming) printf("iFFT (incl. shift) : %.2f ms\n",stopTiming()); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at adj thread synchronization 6: %s\n",cudaGetErrorString(cudaGetLastError())); performCrop(gdata_d,imdata_d,gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at adj thread synchronization 7: %s\n",cudaGetErrorString(cudaGetLastError())); //check if precomputed deapo function can be used if (n_coils > 1 && deapo_d != NULL) performDeapodization(imdata_d,deapo_d,gi_host); else performDeapodization(imdata_d,gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at adj thread synchronization 8: %s\n",cudaGetErrorString(cudaGetLastError())); performFFTScaling(imdata_d,gi_host->im_width_dim,gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error: at adj thread synchronization 9: %s\n",cudaGetErrorString(cudaGetLastError())); if (this->applySensData()) { copyToDevice(this->sens.data + im_coil_offset, sens_d,imdata_count); performSensMul(imdata_d,sens_d,gi_host,true); performSensSum(imdata_d,imdata_sum_d,gi_host); } else { // get result per coil // no summation is performed in absence of sensitity data copyFromDevice<CufftType>(imdata_d,imgData.data+im_coil_offset,imdata_count); } if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error: at adj thread synchronization 10: %s\n",cudaGetErrorString(cudaGetLastError())); }//iterate over coils if (this->applySensData()) { // get result of combined coils copyFromDevice<CufftType>(imdata_sum_d,imgData.data,imdata_count); } if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error: at adj thread synchronization 11: %s\n",cudaGetErrorString(cudaGetLastError())); freeTotalDeviceMemory(data_d,imdata_d,imdata_sum_d,NULL); freeDeviceMemory(n_coils); if ((cudaThreadSynchronize() != cudaSuccess)) fprintf(stderr,"error in gpuNUFFT_gpu_adj function: %s\n",cudaGetErrorString(cudaGetLastError())); free(gi_host); }
TEST(TestForwardBackward,Test_GpuArray) { //Test the same as above but use GpuArray data structure int kernel_width = 3; float osf = 1.25;//oversampling ratio int sector_width = 8; //Data int data_entries = 2; DType2* data = (DType2*) calloc(data_entries,sizeof(DType2)); //2* re + im data[0].x = 5;//Re data[0].y = 0;//Im data[1].x = 1;//Re data[1].y = 0;//Im //Coords //Scaled between -0.5 and 0.5 //in triplets (x,y,z) as structure of array //p0 = (0,0,0) //p1 0 (0.25,0.25,0.25) DType* coords = (DType*) calloc(3*data_entries,sizeof(DType));//3* x,y,z coords[0] = 0.00; //x0 coords[1] = 0.25; //x1 coords[2] = 0.00; //y0 coords[3] = 0.25; //y0 coords[4] = 0.00; //z0 coords[5] = 0.25; //z1 //Input data array, complex values //and copy to GPU gpuNUFFT::GpuArray<DType2> dataArray_gpu; dataArray_gpu.dim.length = data_entries; allocateAndCopyToDeviceMem<DType2>(&dataArray_gpu.data,data,data_entries); //Input array containing trajectory in k-space gpuNUFFT::Array<DType> kSpaceData; kSpaceData.data = coords; kSpaceData.dim.length = data_entries; gpuNUFFT::Dimensions imgDims; imgDims.width = 64; imgDims.height = 64; imgDims.depth = 64; //precomputation performed by factory gpuNUFFT::GpuNUFFTOperatorFactory factory; gpuNUFFT::GpuNUFFTOperator *gpuNUFFTOp = factory.createGpuNUFFTOperator(kSpaceData,kernel_width,sector_width,osf,imgDims); //Output Array gpuNUFFT::GpuArray<CufftType> imgArray_gpu; imgArray_gpu.dim = imgDims; allocateDeviceMem<CufftType>(&imgArray_gpu.data,imgArray_gpu.count()); //Perform FT^H Operation gpuNUFFTOp->performGpuNUFFTAdj(dataArray_gpu, imgArray_gpu); //Perform FT Operation gpuNUFFTOp->performForwardGpuNUFFT(imgArray_gpu,dataArray_gpu); copyFromDevice(dataArray_gpu.data,data,data_entries); printf("contrast %f \n",data[0].x/data[1].x); EXPECT_NEAR(data[0].x/data[1].x,5.0,epsilon); free(data); free(coords); freeTotalDeviceMemory(dataArray_gpu.data,imgArray_gpu.data,NULL); delete gpuNUFFTOp; }