GPUMatrix::GPUMatrix(const Matrix& hostMatrix) : _rows(hostMatrix.rows()), _cols(hostMatrix.cols()), _data(0) { initDeviceMemory(); // Copy data to the device. cublasStatus_t cublasStat = cublasSetMatrix( hostMatrix.rows(), hostMatrix.cols(), sizeof(*hostMatrix._data), hostMatrix._data, hostMatrix.rows(), _data, _rows ); if (cublasStat != CUBLAS_STATUS_SUCCESS) { throw std::runtime_error("Data transfer to GPU failed!"); } }
// ---------------------------------------------------------------------------- // gpuNUFFT_gpu: NUFFT // // Inverse gpuNUFFT implementation - interpolation from uniform grid data onto // nonuniform k-space data based on optimized // gpuNUFFT kernel with minimal oversampling // ratio (see Beatty et al.) // // Basic steps: - apodization correction // - zero padding with osf // - FFT // - convolution and resampling // // parameters: // * data : output kspace data // * data_count : number of samples on trajectory // * n_coils : number of channels or coils // * crds : coordinates on trajectory, passed as SoA // * imdata : input image data // * imdata_count : number of image data points // * grid_width : size of grid // * kernel : precomputed convolution kernel as lookup table // * kernel_count : number of kernel lookup table entries // * sectors : mapping of data indices according to each sector // * sector_count : number of sectors // * sector_centers: coordinates (x,y,z) of sector centers // * sector_width : width of sector // * im_width : dimension of image // * osr : oversampling ratio // * gpuNUFFT_out : enum indicating how far gpuNUFFT has to be processed // void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(gpuNUFFT::Array<DType2> imgData,gpuNUFFT::Array<CufftType>& kspaceData, GpuNUFFTOutput gpuNUFFTOut) { if (DEBUG) { std::cout << "performing forward gpuNUFFT!!!" << std::endl; std::cout << "dataCount: " << kspaceData.count() << " chnCount: " << kspaceData.dim.channels << std::endl; std::cout << "imgCount: " << imgData.count() << " gridWidth: " << this->getGridWidth() << std::endl; } showMemoryInfo(); if (debugTiming) startTiming(); int data_count = (int)this->kSpaceTraj.count(); int n_coils = (int)kspaceData.dim.channels; IndType imdata_count = this->imgDims.count(); int sector_count = (int)this->gridSectorDims.count(); //cuda mem allocation DType2 *imdata_d; CufftType *data_d; if (DEBUG) printf("allocate and copy imdata of size %d...\n",imdata_count); allocateDeviceMem<DType2>(&imdata_d,imdata_count); if (DEBUG) printf("allocate and copy data of size %d...\n",data_count); allocateDeviceMem<CufftType>(&data_d,data_count); initDeviceMemory(n_coils); if (debugTiming) printf("Memory allocation: %.2f ms\n",stopTiming()); int err; //iterate over coils and compute result for (int coil_it = 0; coil_it < n_coils; coil_it++) { int data_coil_offset = coil_it * data_count; int im_coil_offset = coil_it * (int)imdata_count; if (this->applySensData()) // perform automatically "repeating" of input image in case // of existing sensitivity data copyToDevice(imgData.data,imdata_d,imdata_count); else copyToDevice(imgData.data + im_coil_offset,imdata_d,imdata_count); //reset temp arrays cudaMemset(gdata_d,0, sizeof(CufftType)*gi_host->grid_width_dim); cudaMemset(data_d,0, sizeof(CufftType)*data_count); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 1: %s\n",cudaGetErrorString(cudaGetLastError())); if (this->applySensData()) { copyToDevice(this->sens.data + im_coil_offset, sens_d,imdata_count); performSensMul(imdata_d,sens_d,gi_host,false); } // apodization Correction if (n_coils > 1 && deapo_d != NULL) performForwardDeapodization(imdata_d,deapo_d,gi_host); else performForwardDeapodization(imdata_d,gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 2: %s\n",cudaGetErrorString(cudaGetLastError())); // resize by oversampling factor and zero pad performPadding(imdata_d,gdata_d,gi_host); if (debugTiming) startTiming(); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 3: %s\n",cudaGetErrorString(cudaGetLastError())); // shift image to get correct zero frequency position performFFTShift(gdata_d,INVERSE,getGridDims(),gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 4: %s\n",cudaGetErrorString(cudaGetLastError())); // eventually free imdata_d // Forward FFT to kspace domain if (err=pt2CufftExec(fft_plan, gdata_d, gdata_d, CUFFT_FORWARD) != CUFFT_SUCCESS) { fprintf(stderr,"cufft has failed with err %i \n",err); showMemoryInfo(true,stderr); } if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 5: %s\n",cudaGetErrorString(cudaGetLastError())); performFFTShift(gdata_d,FORWARD,getGridDims(),gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 6: %s\n",cudaGetErrorString(cudaGetLastError())); if (debugTiming) printf("FFT (incl. shift): %.2f ms\n",stopTiming()); if (debugTiming) startTiming(); // convolution and resampling to non-standard trajectory forwardConvolution(data_d,crds_d,gdata_d,NULL,sectors_d,sector_centers_d,gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 7: %s\n",cudaGetErrorString(cudaGetLastError())); if (debugTiming) printf("Forward Convolution: %.2f ms\n",stopTiming()); performFFTScaling(data_d,gi_host->data_count,gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error: at thread synchronization 8: %s\n",cudaGetErrorString(cudaGetLastError())); //write result in correct order back into output array writeOrderedGPU(data_sorted_d,data_indices_d,data_d,(int)this->kSpaceTraj.count()); copyFromDevice(data_sorted_d,kspaceData.data + data_coil_offset,data_count); }//iterate over coils freeTotalDeviceMemory(data_d,imdata_d,NULL); freeDeviceMemory(n_coils); if ((cudaThreadSynchronize() != cudaSuccess)) fprintf(stderr,"error in performForwardGpuNUFFT function: %s\n",cudaGetErrorString(cudaGetLastError())); free(gi_host); }
// ---------------------------------------------------------------------------- // performGpuNUFFTAdj: NUFFT^H // // GpuNUFFT implementation - interpolation from nonuniform k-space data onto // oversampled grid based on optimized gpuNUFFT kernel // with minimal oversampling ratio (see Beatty et al.) // // Basic steps: - density compensation // - convolution with interpolation function // - iFFT // - cropping due to oversampling ratio // - apodization correction // // parameters: // * data : input kspace data // * data_count : number of samples on trajectory // * n_coils : number of channels or coils // * crds : coordinate array on trajectory // * imdata : output image data // * imdata_count : number of image data points // * grid_width : size of grid // * kernel : precomputed convolution kernel as lookup table // * kernel_count : number of kernel lookup table entries // * sectors : mapping of start and end points of each sector // * sector_count : number of sectors // * sector_centers: coordinates (x,y,z) of sector centers // * sector_width : width of sector // * im_width : dimension of image // * osr : oversampling ratio // * do_comp : true, if density compensation has to be done // * density_comp : densiy compensation array // * gpuNUFFT_out : enum indicating how far gpuNUFFT has to be processed // void gpuNUFFT::GpuNUFFTOperator::performGpuNUFFTAdj(gpuNUFFT::Array<DType2> kspaceData, gpuNUFFT::Array<CufftType>& imgData, GpuNUFFTOutput gpuNUFFTOut) { if (DEBUG) { std::cout << "performing gpuNUFFT adjoint!!!" << std::endl; std::cout << "dataCount: " << kspaceData.count() << " chnCount: " << kspaceData.dim.channels << std::endl; std::cout << "imgCount: " << imgData.count() << " gridWidth: " << this->getGridWidth() << std::endl; std::cout << "apply density comp: " << this->applyDensComp() << std::endl; std::cout << "apply sens data: " << this->applySensData() << std::endl; } if (debugTiming) startTiming(); showMemoryInfo(); int data_count = (int)this->kSpaceTraj.count(); int n_coils = (int)kspaceData.dim.channels; IndType imdata_count = this->imgDims.count(); int sector_count = (int)this->gridSectorDims.count(); // select data ordered and leave it on gpu DType2* data_d; if (DEBUG) printf("allocate data of size %d...\n",data_count); allocateDeviceMem<DType2>(&data_d,data_count); CufftType *imdata_d, *imdata_sum_d = NULL; if (DEBUG) printf("allocate and copy imdata of size %d...\n",imdata_count); allocateDeviceMem<CufftType>(&imdata_d,imdata_count); if (this->applySensData()) { if (DEBUG) printf("allocate and copy temp imdata of size %d...\n",imdata_count); allocateDeviceMem<CufftType>(&imdata_sum_d,imdata_count); cudaMemset(imdata_sum_d,0,imdata_count*sizeof(CufftType)); } initDeviceMemory(n_coils); int err; if (debugTiming) printf("Memory allocation: %.2f ms\n",stopTiming()); //iterate over coils and compute result for (int coil_it = 0; coil_it < n_coils; coil_it++) { int data_coil_offset = coil_it * data_count; int im_coil_offset = coil_it * (int)imdata_count;//gi_host->width_dim; cudaMemset(gdata_d,0, sizeof(CufftType)*gi_host->grid_width_dim); //copy coil data to device and select ordered copyToDevice(kspaceData.data + data_coil_offset,data_d,data_count); selectOrderedGPU(data_d,data_indices_d,data_sorted_d,data_count); if (this->applyDensComp()) performDensityCompensation(data_sorted_d,density_comp_d,gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at adj thread synchronization 1: %s\n",cudaGetErrorString(cudaGetLastError())); if (debugTiming) startTiming(); adjConvolution(data_sorted_d,crds_d,gdata_d,NULL,sectors_d,sector_centers_d,gi_host); if (debugTiming) printf("Adjoint convolution: %.2f ms\n",stopTiming()); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) fprintf(stderr,"error at adj thread synchronization 2: %s\n",cudaGetErrorString(cudaGetLastError())); if (gpuNUFFTOut == CONVOLUTION) { if (DEBUG) printf("stopping output after CONVOLUTION step\n"); //get output copyFromDevice<CufftType>(gdata_d,imgData.data,gi_host->grid_width_dim); if (DEBUG) printf("test value at point zero: %f\n",(imgData.data)[0].x); free(gi_host); freeTotalDeviceMemory(data_d,imdata_d,imdata_sum_d,NULL); freeDeviceMemory(n_coils); return; } if ((cudaThreadSynchronize() != cudaSuccess)) fprintf(stderr,"error at adj thread synchronization 3: %s\n",cudaGetErrorString(cudaGetLastError())); if (debugTiming) startTiming(); performFFTShift(gdata_d,INVERSE,getGridDims(),gi_host); //Inverse FFT if (err=pt2CufftExec(fft_plan, gdata_d, gdata_d, CUFFT_INVERSE) != CUFFT_SUCCESS) { fprintf(stderr,"cufft has failed at adj with err %i \n",err); showMemoryInfo(true,stderr); } if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) fprintf(stderr,"error at adj thread synchronization 4: %s\n",cudaGetErrorString(cudaGetLastError())); if (gpuNUFFTOut == FFT) { if (DEBUG) printf("stopping output after FFT step\n"); //get output copyFromDevice<CufftType>(gdata_d,imgData.data,gi_host->grid_width_dim); free(gi_host); freeTotalDeviceMemory(data_d,imdata_d,imdata_sum_d,NULL); freeDeviceMemory(n_coils); printf("last cuda error: %s\n", cudaGetErrorString(cudaGetLastError())); return; } if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at adj thread synchronization 5: %s\n",cudaGetErrorString(cudaGetLastError())); performFFTShift(gdata_d,INVERSE,getGridDims(),gi_host); if (debugTiming) printf("iFFT (incl. shift) : %.2f ms\n",stopTiming()); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at adj thread synchronization 6: %s\n",cudaGetErrorString(cudaGetLastError())); performCrop(gdata_d,imdata_d,gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at adj thread synchronization 7: %s\n",cudaGetErrorString(cudaGetLastError())); //check if precomputed deapo function can be used if (n_coils > 1 && deapo_d != NULL) performDeapodization(imdata_d,deapo_d,gi_host); else performDeapodization(imdata_d,gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at adj thread synchronization 8: %s\n",cudaGetErrorString(cudaGetLastError())); performFFTScaling(imdata_d,gi_host->im_width_dim,gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error: at adj thread synchronization 9: %s\n",cudaGetErrorString(cudaGetLastError())); if (this->applySensData()) { copyToDevice(this->sens.data + im_coil_offset, sens_d,imdata_count); performSensMul(imdata_d,sens_d,gi_host,true); performSensSum(imdata_d,imdata_sum_d,gi_host); } else { // get result per coil // no summation is performed in absence of sensitity data copyFromDevice<CufftType>(imdata_d,imgData.data+im_coil_offset,imdata_count); } if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error: at adj thread synchronization 10: %s\n",cudaGetErrorString(cudaGetLastError())); }//iterate over coils if (this->applySensData()) { // get result of combined coils copyFromDevice<CufftType>(imdata_sum_d,imgData.data,imdata_count); } if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error: at adj thread synchronization 11: %s\n",cudaGetErrorString(cudaGetLastError())); freeTotalDeviceMemory(data_d,imdata_d,imdata_sum_d,NULL); freeDeviceMemory(n_coils); if ((cudaThreadSynchronize() != cudaSuccess)) fprintf(stderr,"error in gpuNUFFT_gpu_adj function: %s\n",cudaGetErrorString(cudaGetLastError())); free(gi_host); }
GPUMatrix::GPUMatrix(unsigned int rows, unsigned int cols) : _rows(rows), _cols(cols) { initDeviceMemory(); }