TEST_F(TestFFT,Test8x11x4) { gpuNUFFT::Dimensions imgDims; imgDims.width = 8; imgDims.height = 11; imgDims.depth = 4; //Input data array, complex values gpuNUFFT::Array<CufftType> dataArray; dataArray.data = &data[0]; dataArray.dim = imgDims; CufftType* data_d; allocateAndCopyToDeviceMem<CufftType>(&data_d,dataArray.data,dataArray.count()); gpuNUFFT::GpuNUFFTInfo gi_host; gi_host.is2Dprocessing = false; gi_host.gridDims.x = imgDims.width; gi_host.gridDims.y = imgDims.height; gi_host.gridDims.z = imgDims.depth; gi_host.n_coils_cc = 1; initConstSymbol("GI",&gi_host,sizeof(gpuNUFFT::GpuNUFFTInfo)); debug("Input:", data, imgDims); performFFTShift(data_d, gpuNUFFT::FORWARD, imgDims, &gi_host); std::vector<CufftType> result(imgDims.count()); copyFromDevice(data_d, &result[0], dataArray.count()); debug("Output FFTSHIFT(data):",result,imgDims); performFFTShift(data_d,gpuNUFFT::INVERSE,imgDims,&gi_host); copyFromDevice(data_d, &result[0], dataArray.count()); debug("Output IFFTSHIFT(FFTSHIFT(data)):",result,imgDims); for (unsigned i=0; i < imgDims.count(); i++) { EXPECT_NEAR(data[i].x,result[i].x,epsilon); EXPECT_NEAR(data[i].y,result[i].y,epsilon); } cudaFree(data_d); }
// ---------------------------------------------------------------------------- // gpuNUFFT_gpu: NUFFT // // Inverse gpuNUFFT implementation - interpolation from uniform grid data onto // nonuniform k-space data based on optimized // gpuNUFFT kernel with minimal oversampling // ratio (see Beatty et al.) // // Basic steps: - apodization correction // - zero padding with osf // - FFT // - convolution and resampling // // parameters: // * data : output kspace data // * data_count : number of samples on trajectory // * n_coils : number of channels or coils // * crds : coordinates on trajectory, passed as SoA // * imdata : input image data // * imdata_count : number of image data points // * grid_width : size of grid // * kernel : precomputed convolution kernel as lookup table // * kernel_count : number of kernel lookup table entries // * sectors : mapping of data indices according to each sector // * sector_count : number of sectors // * sector_centers: coordinates (x,y,z) of sector centers // * sector_width : width of sector // * im_width : dimension of image // * osr : oversampling ratio // * gpuNUFFT_out : enum indicating how far gpuNUFFT has to be processed // void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(gpuNUFFT::Array<DType2> imgData,gpuNUFFT::Array<CufftType>& kspaceData, GpuNUFFTOutput gpuNUFFTOut) { if (DEBUG) { std::cout << "performing forward gpuNUFFT!!!" << std::endl; std::cout << "dataCount: " << kspaceData.count() << " chnCount: " << kspaceData.dim.channels << std::endl; std::cout << "imgCount: " << imgData.count() << " gridWidth: " << this->getGridWidth() << std::endl; } showMemoryInfo(); if (debugTiming) startTiming(); int data_count = (int)this->kSpaceTraj.count(); int n_coils = (int)kspaceData.dim.channels; IndType imdata_count = this->imgDims.count(); int sector_count = (int)this->gridSectorDims.count(); //cuda mem allocation DType2 *imdata_d; CufftType *data_d; if (DEBUG) printf("allocate and copy imdata of size %d...\n",imdata_count); allocateDeviceMem<DType2>(&imdata_d,imdata_count); if (DEBUG) printf("allocate and copy data of size %d...\n",data_count); allocateDeviceMem<CufftType>(&data_d,data_count); initDeviceMemory(n_coils); if (debugTiming) printf("Memory allocation: %.2f ms\n",stopTiming()); int err; //iterate over coils and compute result for (int coil_it = 0; coil_it < n_coils; coil_it++) { int data_coil_offset = coil_it * data_count; int im_coil_offset = coil_it * (int)imdata_count; if (this->applySensData()) // perform automatically "repeating" of input image in case // of existing sensitivity data copyToDevice(imgData.data,imdata_d,imdata_count); else copyToDevice(imgData.data + im_coil_offset,imdata_d,imdata_count); //reset temp arrays cudaMemset(gdata_d,0, sizeof(CufftType)*gi_host->grid_width_dim); cudaMemset(data_d,0, sizeof(CufftType)*data_count); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 1: %s\n",cudaGetErrorString(cudaGetLastError())); if (this->applySensData()) { copyToDevice(this->sens.data + im_coil_offset, sens_d,imdata_count); performSensMul(imdata_d,sens_d,gi_host,false); } // apodization Correction if (n_coils > 1 && deapo_d != NULL) performForwardDeapodization(imdata_d,deapo_d,gi_host); else performForwardDeapodization(imdata_d,gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 2: %s\n",cudaGetErrorString(cudaGetLastError())); // resize by oversampling factor and zero pad performPadding(imdata_d,gdata_d,gi_host); if (debugTiming) startTiming(); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 3: %s\n",cudaGetErrorString(cudaGetLastError())); // shift image to get correct zero frequency position performFFTShift(gdata_d,INVERSE,getGridDims(),gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 4: %s\n",cudaGetErrorString(cudaGetLastError())); // eventually free imdata_d // Forward FFT to kspace domain if (err=pt2CufftExec(fft_plan, gdata_d, gdata_d, CUFFT_FORWARD) != CUFFT_SUCCESS) { fprintf(stderr,"cufft has failed with err %i \n",err); showMemoryInfo(true,stderr); } if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 5: %s\n",cudaGetErrorString(cudaGetLastError())); performFFTShift(gdata_d,FORWARD,getGridDims(),gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 6: %s\n",cudaGetErrorString(cudaGetLastError())); if (debugTiming) printf("FFT (incl. shift): %.2f ms\n",stopTiming()); if (debugTiming) startTiming(); // convolution and resampling to non-standard trajectory forwardConvolution(data_d,crds_d,gdata_d,NULL,sectors_d,sector_centers_d,gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 7: %s\n",cudaGetErrorString(cudaGetLastError())); if (debugTiming) printf("Forward Convolution: %.2f ms\n",stopTiming()); performFFTScaling(data_d,gi_host->data_count,gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error: at thread synchronization 8: %s\n",cudaGetErrorString(cudaGetLastError())); //write result in correct order back into output array writeOrderedGPU(data_sorted_d,data_indices_d,data_d,(int)this->kSpaceTraj.count()); copyFromDevice(data_sorted_d,kspaceData.data + data_coil_offset,data_count); }//iterate over coils freeTotalDeviceMemory(data_d,imdata_d,NULL); freeDeviceMemory(n_coils); if ((cudaThreadSynchronize() != cudaSuccess)) fprintf(stderr,"error in performForwardGpuNUFFT function: %s\n",cudaGetErrorString(cudaGetLastError())); free(gi_host); }
// **************************************************************************** // Function: RunBenchmark // // Purpose: // Runs the stablity test. The algorithm for the parallel // version of the test, which enables testing of an entire GPU // cluster at the same time, is as follows. Each participating node // first allocates its data, while node zero additionally determines // start and finish times based on a user input parameter. All nodes // then enter the outermost loop, copying fresh data from the CPU // before entering the core of the test. In the core, each node // performs a loop consisting of the forward kernel, a potential // check, and then the inverse kernel. After performing a configurable // number of forward/inverse iterations, along with a configurable // number of checks, each node sends the number of failures it // encountered to node zero. Node zero collects and reports the error // counts, determines whether the test has run its course, and // broadcasts the decision. If the decision is to proceed, each node // begins the next iteration of the outer loop, copying fresh data and // then performing the kernels and checks of the core loop. // // Arguments: // resultDB: the benchmark stores its results in this ResultDatabase // op: the options parser / parameter database // // Returns: nothing // // Programmer: Collin McCurdy // Creation: September 08, 2009 // // Modifications: // // **************************************************************************** void RunBenchmark(ResultDatabase &resultDB, OptionParser& op) { int mpi_rank, mpi_size, node_rank; int i, j; float2* source, * result; void* work, * chk; #ifdef PARALLEL MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); NodeInfo NI; node_rank = NI.nodeRank(); cout << "MPI Task " << mpi_rank << " of " << mpi_size << " (noderank=" << node_rank << ") starting....\n"; #else mpi_rank = 0; mpi_size = 1; node_rank = 0; #endif // ensure chk buffer alloc succeeds before grabbing the // rest of available memory. allocDeviceBuffer(&chk, 1); unsigned long avail_bytes = findAvailBytes(); // unsigned long avail_bytes = 1024*1024*1024-1; // now determine how much available memory will be used (subject // to CUDA's constraint on the maximum block dimension size) int blocks = avail_bytes / (512*sizeof(float2)); int slices = 1; while (blocks/slices > 65535) { slices *= 2; } int half_n_ffts = ((blocks/slices)*slices)/2; int n_ffts = half_n_ffts * 2; fprintf(stderr, "avail_bytes=%ld, blocks=%d, n_ffts=%d\n", avail_bytes, blocks, n_ffts); int half_n_cmplx = half_n_ffts * 512; unsigned long used_bytes = half_n_cmplx * 2 * sizeof(float2); cout << mpi_rank << ": testing " << used_bytes/((double)1024*1024) << " MBs\n"; // allocate host memory source = (float2*)malloc(used_bytes); result = (float2*)malloc(used_bytes); // alloc device memory allocDeviceBuffer(&work, used_bytes); // alloc gather buffer int* recvbuf = (int*)malloc(mpi_size*sizeof(int)); // compute start and finish times time_t start = time(NULL); time_t finish = start + (time_t)(op.getOptionInt("time")*60); struct tm start_tm, finish_tm; localtime_r(&start, &start_tm); localtime_r(&finish, &finish_tm); if (mpi_rank == 0) { printf("start = %s", asctime(&start_tm)); printf("finish = %s", asctime(&finish_tm)); } for (int iter = 0; ; iter++) { bool failed = false; int errorCount = 0, stop = 0; // (re-)init host memory... for (i = 0; i < half_n_cmplx; i++) { source[i].x = (rand()/(float)RAND_MAX)*2-1; source[i].y = (rand()/(float)RAND_MAX)*2-1; source[i+half_n_cmplx].x = source[i].x; source[i+half_n_cmplx].y = source[i].y; } // copy to device copyToDevice(work, source, used_bytes); copyToDevice(chk, &errorCount, 1); forward(work, n_ffts); if (check(work, chk, half_n_ffts, half_n_cmplx)) { fprintf(stderr, "First check failed..."); failed = true; } if (!failed) { for (i = 1; i <= CHECKS; i++) { for (j = 1; j <= ITERS_PER_CHECK; j++) { inverse(work, n_ffts); forward(work, n_ffts); } if (check(work, chk, half_n_ffts, half_n_cmplx)) { failed = true; break; } } } // failing node is responsible for verifying failure, counting // errors and reporting count to node 0. if (failed) { fprintf(stderr, "Failure on node %d, iter %d:", mpi_rank, iter); // repeat check on CPU copyFromDevice(result, work, used_bytes); float2* result2 = result + half_n_cmplx; for (j = 0; j < half_n_cmplx; j++) { if (result[j].x != result2[j].x || result[j].y != result2[j].y) { errorCount++; } } if (!errorCount) { fprintf(stderr, "verification failed!\n"); } else { fprintf(stderr, "%d errors\n", errorCount); } } #ifdef PARALLEL MPI_Gather(&errorCount, 1, MPI_INT, recvbuf, 1, MPI_INT, 0, MPI_COMM_WORLD); #else recvbuf[0] = errorCount; #endif // node 0 collects and reports error counts, determines // whether test has run its course, and broadcasts decision if (mpi_rank == 0) { time_t curtime = time(NULL); struct tm curtm; localtime_r(&curtime, &curtm); fprintf(stderr, "iter=%d: %s", iter, asctime(&curtm)); for (i = 0; i < mpi_size; i++) { if (recvbuf[i]) { fprintf(stderr, "--> %d failures on node %d\n", recvbuf[i], i); } } if (curtime > finish) { stop = 1; } } #ifdef PARALLEL MPI_Bcast(&stop, 1, MPI_INT, 0, MPI_COMM_WORLD); #endif resultDB.AddResult("Check", "", "Failures", errorCount); if (stop) break; } freeDeviceBuffer(work); freeDeviceBuffer(chk); free(source); free(result); free(recvbuf); }
TEST(TestForwardBackward,Test_GpuArray) { //Test the same as above but use GpuArray data structure int kernel_width = 3; float osf = 1.25;//oversampling ratio int sector_width = 8; //Data int data_entries = 2; DType2* data = (DType2*) calloc(data_entries,sizeof(DType2)); //2* re + im data[0].x = 5;//Re data[0].y = 0;//Im data[1].x = 1;//Re data[1].y = 0;//Im //Coords //Scaled between -0.5 and 0.5 //in triplets (x,y,z) as structure of array //p0 = (0,0,0) //p1 0 (0.25,0.25,0.25) DType* coords = (DType*) calloc(3*data_entries,sizeof(DType));//3* x,y,z coords[0] = 0.00; //x0 coords[1] = 0.25; //x1 coords[2] = 0.00; //y0 coords[3] = 0.25; //y0 coords[4] = 0.00; //z0 coords[5] = 0.25; //z1 //Input data array, complex values //and copy to GPU gpuNUFFT::GpuArray<DType2> dataArray_gpu; dataArray_gpu.dim.length = data_entries; allocateAndCopyToDeviceMem<DType2>(&dataArray_gpu.data,data,data_entries); //Input array containing trajectory in k-space gpuNUFFT::Array<DType> kSpaceData; kSpaceData.data = coords; kSpaceData.dim.length = data_entries; gpuNUFFT::Dimensions imgDims; imgDims.width = 64; imgDims.height = 64; imgDims.depth = 64; //precomputation performed by factory gpuNUFFT::GpuNUFFTOperatorFactory factory; gpuNUFFT::GpuNUFFTOperator *gpuNUFFTOp = factory.createGpuNUFFTOperator(kSpaceData,kernel_width,sector_width,osf,imgDims); //Output Array gpuNUFFT::GpuArray<CufftType> imgArray_gpu; imgArray_gpu.dim = imgDims; allocateDeviceMem<CufftType>(&imgArray_gpu.data,imgArray_gpu.count()); //Perform FT^H Operation gpuNUFFTOp->performGpuNUFFTAdj(dataArray_gpu, imgArray_gpu); //Perform FT Operation gpuNUFFTOp->performForwardGpuNUFFT(imgArray_gpu,dataArray_gpu); copyFromDevice(dataArray_gpu.data,data,data_entries); printf("contrast %f \n",data[0].x/data[1].x); EXPECT_NEAR(data[0].x/data[1].x,5.0,epsilon); free(data); free(coords); freeTotalDeviceMemory(dataArray_gpu.data,imgArray_gpu.data,NULL); delete gpuNUFFTOp; }
void bluesteinsFFTGpu(const char* const argv[],const unsigned n, const unsigned orign,const unsigned size) { const unsigned powM = (unsigned) log2(n); printf("Compiling Bluesteins Program..\n"); compileProgram(argv, "fft.h", "kernels/bluesteins.cl"); printf("Creating Kernel\n"); for (unsigned i = 0; i < deviceCount; ++i) { createKernel(i, "bluesteins"); } const unsigned sizePerGPU = size / deviceCount; for (unsigned i = 0; i < deviceCount; ++i) { workSize[i] = (i != (deviceCount - 1)) ? sizePerGPU : (size - workOffset[i]); allocateDeviceMemoryBS(i , workSize[i], workOffset[i]); clSetKernelArg(kernel[i], 0, sizeof(cl_mem), (void*) &d_Hreal[i]); clSetKernelArg(kernel[i], 1, sizeof(cl_mem), (void*) &d_Himag[i]); clSetKernelArg(kernel[i], 2, sizeof(cl_mem), (void*) &d_Yreal[i]); clSetKernelArg(kernel[i], 3, sizeof(cl_mem), (void*) &d_Yimag[i]); clSetKernelArg(kernel[i], 4, sizeof(cl_mem), (void*) &d_Zreal[i]); clSetKernelArg(kernel[i], 5, sizeof(cl_mem), (void*) &d_Zimag[i]); clSetKernelArg(kernel[i], 6, sizeof(unsigned), &n); clSetKernelArg(kernel[i], 7, sizeof(unsigned), &orign); clSetKernelArg(kernel[i], 8, sizeof(unsigned), &powM); clSetKernelArg(kernel[i], 9, sizeof(unsigned), &blockSize); if ((i + 1) < deviceCount) { workOffset[i + 1] = workOffset[i] + workSize[i]; } } size_t localWorkSize[] = {blockSize}; for (unsigned i = 0; i < deviceCount; ++i) { size_t globalWorkSize[] = {shrRoundUp(blockSize, workSize[i])}; // kernel non blocking execution runKernel(i, localWorkSize, globalWorkSize); } h_Rreal = h_Hreal; h_Rimag = h_Himag; for (unsigned i = 0; i < deviceCount; ++i) { copyFromDevice(i, d_Hreal[i], h_Rreal + workOffset[i], workSize[i]); copyFromDevice(i, d_Himag[i], h_Rimag + workOffset[i], workSize[i]); } // wait for copy event const cl_int ciErrNum = clWaitForEvents(deviceCount, gpuDone); checkError(ciErrNum, CL_SUCCESS, "clWaitForEvents"); printGpuTime(); }