Esempio n. 1
0
TEST_F(TestFFT,Test8x11x4)
{
	gpuNUFFT::Dimensions imgDims;
	imgDims.width = 8;
	imgDims.height = 11;
	imgDims.depth = 4;
   
  //Input data array, complex values
	gpuNUFFT::Array<CufftType> dataArray;
	dataArray.data = &data[0];
	dataArray.dim = imgDims;

  CufftType* data_d;
  allocateAndCopyToDeviceMem<CufftType>(&data_d,dataArray.data,dataArray.count());

  gpuNUFFT::GpuNUFFTInfo gi_host;

  gi_host.is2Dprocessing = false;

  gi_host.gridDims.x = imgDims.width;
  gi_host.gridDims.y = imgDims.height;
  gi_host.gridDims.z = imgDims.depth;
  gi_host.n_coils_cc = 1;

  initConstSymbol("GI",&gi_host,sizeof(gpuNUFFT::GpuNUFFTInfo));

  debug("Input:", data, imgDims);
  
  performFFTShift(data_d, gpuNUFFT::FORWARD, imgDims, &gi_host);

  std::vector<CufftType> result(imgDims.count());
  copyFromDevice(data_d, &result[0], dataArray.count());

  debug("Output FFTSHIFT(data):",result,imgDims);

  performFFTShift(data_d,gpuNUFFT::INVERSE,imgDims,&gi_host);
  copyFromDevice(data_d, &result[0], dataArray.count());

  debug("Output IFFTSHIFT(FFTSHIFT(data)):",result,imgDims);


  for (unsigned i=0; i < imgDims.count(); i++)
  {
    EXPECT_NEAR(data[i].x,result[i].x,epsilon);
    EXPECT_NEAR(data[i].y,result[i].y,epsilon);
  }

  cudaFree(data_d);
}
Esempio n. 2
0
// ----------------------------------------------------------------------------
// gpuNUFFT_gpu: NUFFT
//
// Inverse gpuNUFFT implementation - interpolation from uniform grid data onto 
//                                   nonuniform k-space data based on optimized 
//                                   gpuNUFFT kernel with minimal oversampling
//                                   ratio (see Beatty et al.)
//
// Basic steps: - apodization correction
//              - zero padding with osf
//              - FFT
//							- convolution and resampling
//
// parameters:
//	* data		     : output kspace data 
//  * data_count   : number of samples on trajectory
//  * n_coils      : number of channels or coils
//  * crds         : coordinates on trajectory, passed as SoA
//  * imdata       : input image data
//  * imdata_count : number of image data points
//  * grid_width   : size of grid 
//  * kernel       : precomputed convolution kernel as lookup table
//  * kernel_count : number of kernel lookup table entries
//  * sectors      : mapping of data indices according to each sector
//  * sector_count : number of sectors
//  * sector_centers: coordinates (x,y,z) of sector centers
//  * sector_width : width of sector 
//  * im_width     : dimension of image
//  * osr          : oversampling ratio
//  * gpuNUFFT_out : enum indicating how far gpuNUFFT has to be processed
//  
void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(gpuNUFFT::Array<DType2> imgData,gpuNUFFT::Array<CufftType>& kspaceData, GpuNUFFTOutput gpuNUFFTOut)
{
  if (DEBUG)
  {
    std::cout << "performing forward gpuNUFFT!!!" << std::endl;
    std::cout << "dataCount: " << kspaceData.count() << " chnCount: " << kspaceData.dim.channels << std::endl;
    std::cout << "imgCount: " << imgData.count() << " gridWidth: " << this->getGridWidth() << std::endl;
  }
  showMemoryInfo();
  
  if (debugTiming)
    startTiming();

  int			data_count          = (int)this->kSpaceTraj.count();
  int			n_coils             = (int)kspaceData.dim.channels;
  IndType	imdata_count        = this->imgDims.count();
  int			sector_count        = (int)this->gridSectorDims.count();

  //cuda mem allocation
  DType2 *imdata_d;
  CufftType *data_d;

  if (DEBUG)
    printf("allocate and copy imdata of size %d...\n",imdata_count);
  allocateDeviceMem<DType2>(&imdata_d,imdata_count);

  if (DEBUG)
    printf("allocate and copy data of size %d...\n",data_count);
  allocateDeviceMem<CufftType>(&data_d,data_count);

  initDeviceMemory(n_coils);
    
  if (debugTiming)
    printf("Memory allocation: %.2f ms\n",stopTiming());

  int err;

  //iterate over coils and compute result
  for (int coil_it = 0; coil_it < n_coils; coil_it++)
  {
    int data_coil_offset = coil_it * data_count;
    int im_coil_offset = coil_it * (int)imdata_count;

    if (this->applySensData())
      // perform automatically "repeating" of input image in case
      // of existing sensitivity data
      copyToDevice(imgData.data,imdata_d,imdata_count);
    else
      copyToDevice(imgData.data + im_coil_offset,imdata_d,imdata_count);

    //reset temp arrays
    cudaMemset(gdata_d,0, sizeof(CufftType)*gi_host->grid_width_dim);
    cudaMemset(data_d,0, sizeof(CufftType)*data_count);

    if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
      printf("error at thread synchronization 1: %s\n",cudaGetErrorString(cudaGetLastError()));
    
    if (this->applySensData())
    {
      copyToDevice(this->sens.data + im_coil_offset, sens_d,imdata_count);
      performSensMul(imdata_d,sens_d,gi_host,false);
    }

    // apodization Correction
    if (n_coils > 1 && deapo_d != NULL)
      performForwardDeapodization(imdata_d,deapo_d,gi_host);
    else
      performForwardDeapodization(imdata_d,gi_host);

    if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
      printf("error at thread synchronization 2: %s\n",cudaGetErrorString(cudaGetLastError()));
    // resize by oversampling factor and zero pad
    performPadding(imdata_d,gdata_d,gi_host);

    if (debugTiming)
      startTiming();

    if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
      printf("error at thread synchronization 3: %s\n",cudaGetErrorString(cudaGetLastError()));
    // shift image to get correct zero frequency position
    performFFTShift(gdata_d,INVERSE,getGridDims(),gi_host);

    if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
      printf("error at thread synchronization 4: %s\n",cudaGetErrorString(cudaGetLastError()));
    // eventually free imdata_d
    // Forward FFT to kspace domain
    if (err=pt2CufftExec(fft_plan, gdata_d, gdata_d, CUFFT_FORWARD) != CUFFT_SUCCESS)
    {
      fprintf(stderr,"cufft has failed with err %i \n",err);
      showMemoryInfo(true,stderr);
    }

    if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
      printf("error at thread synchronization 5: %s\n",cudaGetErrorString(cudaGetLastError()));
    performFFTShift(gdata_d,FORWARD,getGridDims(),gi_host);

    if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
      printf("error at thread synchronization 6: %s\n",cudaGetErrorString(cudaGetLastError()));

    if (debugTiming)
      printf("FFT (incl. shift): %.2f ms\n",stopTiming());

    if (debugTiming)
      startTiming();

    // convolution and resampling to non-standard trajectory
    forwardConvolution(data_d,crds_d,gdata_d,NULL,sectors_d,sector_centers_d,gi_host);
    if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
      printf("error at thread synchronization 7: %s\n",cudaGetErrorString(cudaGetLastError()));
    
    if (debugTiming)
      printf("Forward Convolution: %.2f ms\n",stopTiming());

    performFFTScaling(data_d,gi_host->data_count,gi_host);
    if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
      printf("error: at thread synchronization 8: %s\n",cudaGetErrorString(cudaGetLastError()));
    
    //write result in correct order back into output array
    writeOrderedGPU(data_sorted_d,data_indices_d,data_d,(int)this->kSpaceTraj.count());
    copyFromDevice(data_sorted_d,kspaceData.data + data_coil_offset,data_count);
  }//iterate over coils
  
  freeTotalDeviceMemory(data_d,imdata_d,NULL);
  freeDeviceMemory(n_coils);

  if ((cudaThreadSynchronize() != cudaSuccess))
    fprintf(stderr,"error in performForwardGpuNUFFT function: %s\n",cudaGetErrorString(cudaGetLastError()));
  free(gi_host);
}
Esempio n. 3
0
// ****************************************************************************
// Function: RunBenchmark
//
// Purpose: 
//   Runs the stablity test. The algorithm for the parallel
//   version of the test, which enables testing of an entire GPU
//   cluster at the same time, is as follows. Each participating node
//   first allocates its data, while node zero additionally determines
//   start and finish times based on a user input parameter. All nodes
//   then enter the outermost loop, copying fresh data from the CPU
//   before entering the core of the test. In the core, each node
//   performs a loop consisting of the forward kernel, a potential
//   check, and then the inverse kernel. After performing a configurable
//   number of forward/inverse iterations, along with a configurable
//   number of checks, each node sends the number of failures it
//   encountered to node zero. Node zero collects and reports the error
//   counts, determines whether the test has run its course, and
//   broadcasts the decision. If the decision is to proceed, each node
//   begins the next iteration of the outer loop, copying fresh data and
//   then performing the kernels and checks of the core loop.
//
// Arguments:
//   resultDB: the benchmark stores its results in this ResultDatabase
//   op: the options parser / parameter database
//
// Returns:  nothing
//
// Programmer: Collin McCurdy
// Creation: September 08, 2009
//
// Modifications:
//
// ****************************************************************************
void RunBenchmark(ResultDatabase &resultDB, OptionParser& op)
{	
    int mpi_rank, mpi_size, node_rank;
    int i, j;
    float2* source, * result;
    void* work, * chk;

#ifdef PARALLEL
    MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);

    NodeInfo NI;
    node_rank = NI.nodeRank();

    cout << "MPI Task " << mpi_rank << " of " << mpi_size 
         << " (noderank=" << node_rank << ") starting....\n";
#else
    mpi_rank = 0;
    mpi_size = 1;
    node_rank = 0;
#endif
   
    // ensure chk buffer alloc succeeds before grabbing the
    // rest of available memory.
    allocDeviceBuffer(&chk, 1);
    unsigned long avail_bytes = findAvailBytes();
    // unsigned long avail_bytes = 1024*1024*1024-1;
    
    // now determine how much available memory will be used (subject
    // to CUDA's constraint on the maximum block dimension size)
    int blocks = avail_bytes / (512*sizeof(float2));
    int slices = 1;
    while (blocks/slices > 65535) {
        slices *= 2;
    }
    int half_n_ffts = ((blocks/slices)*slices)/2;
    int n_ffts = half_n_ffts * 2;
    fprintf(stderr, "avail_bytes=%ld, blocks=%d, n_ffts=%d\n", 
            avail_bytes, blocks, n_ffts);

    int half_n_cmplx = half_n_ffts * 512;
    unsigned long used_bytes = half_n_cmplx * 2 * sizeof(float2);

    cout << mpi_rank << ": testing " 
         << used_bytes/((double)1024*1024) << " MBs\n";

    // allocate host memory
    source = (float2*)malloc(used_bytes);
    result = (float2*)malloc(used_bytes);

    // alloc device memory
    allocDeviceBuffer(&work, used_bytes);

    // alloc gather buffer
    int* recvbuf = (int*)malloc(mpi_size*sizeof(int));
    
    // compute start and finish times
    time_t start = time(NULL);
    time_t finish = start + (time_t)(op.getOptionInt("time")*60);
    struct tm start_tm, finish_tm;
    localtime_r(&start, &start_tm);
    localtime_r(&finish, &finish_tm);
    if (mpi_rank == 0) {
        printf("start = %s", asctime(&start_tm));
        printf("finish = %s", asctime(&finish_tm));
    }
    
    for (int iter = 0; ; iter++) {
        bool failed = false;
        int errorCount = 0, stop = 0;

        // (re-)init host memory...
        for (i = 0; i < half_n_cmplx; i++) {
            source[i].x = (rand()/(float)RAND_MAX)*2-1;
            source[i].y = (rand()/(float)RAND_MAX)*2-1;
            source[i+half_n_cmplx].x = source[i].x;
            source[i+half_n_cmplx].y = source[i].y;
        }

        // copy to device
        copyToDevice(work, source, used_bytes);
        copyToDevice(chk, &errorCount, 1);

        forward(work, n_ffts);
        if (check(work, chk, half_n_ffts, half_n_cmplx)) {
            fprintf(stderr, "First check failed...");
            failed = true;
        }

        if (!failed) {
            for (i = 1; i <= CHECKS; i++) {
                for (j = 1; j <= ITERS_PER_CHECK; j++) {
                    inverse(work, n_ffts);
                    forward(work, n_ffts);
                }

                if (check(work, chk, half_n_ffts, half_n_cmplx)) {
                    failed = true;
                    break;
                }
            }
        }

        // failing node is responsible for verifying failure, counting
        // errors and reporting count to node 0.
        if (failed) {
            fprintf(stderr, "Failure on node %d, iter %d:", mpi_rank, iter);
            
            // repeat check on CPU
            copyFromDevice(result, work, used_bytes);
            float2* result2 = result + half_n_cmplx;
            for (j = 0; j < half_n_cmplx; j++) {
                if (result[j].x != result2[j].x || 
                    result[j].y != result2[j].y) 
                {
                    errorCount++;
                }
            }
            if (!errorCount) {
                fprintf(stderr, "verification failed!\n");
            }
            else {
                fprintf(stderr, "%d errors\n", errorCount);
            }
        }

#ifdef PARALLEL
        MPI_Gather(&errorCount, 1, MPI_INT, recvbuf, 1, MPI_INT, 
                   0, MPI_COMM_WORLD);
#else
        recvbuf[0] = errorCount;
#endif

        // node 0 collects and reports error counts, determines
        // whether test has run its course, and broadcasts decision
        if (mpi_rank == 0) {
            time_t curtime = time(NULL);
            struct tm curtm;
            localtime_r(&curtime, &curtm);
            fprintf(stderr, "iter=%d: %s", iter, asctime(&curtm));
            
            for (i = 0; i < mpi_size; i++) {
                if (recvbuf[i]) {
                    fprintf(stderr, "--> %d failures on node %d\n", recvbuf[i], i);
                }
            }

            if (curtime > finish) {
                stop = 1;
            }
        }
        
#ifdef PARALLEL
        MPI_Bcast(&stop, 1, MPI_INT, 0, MPI_COMM_WORLD);
#endif

        resultDB.AddResult("Check", "", "Failures", errorCount);
        if (stop) break;
    }
    
    freeDeviceBuffer(work);
    freeDeviceBuffer(chk);

    free(source);
    free(result);
    free(recvbuf);
}
Esempio n. 4
0
TEST(TestForwardBackward,Test_GpuArray)
{
  //Test the same as above but use GpuArray data structure  

  int kernel_width = 3;
	float osf = 1.25;//oversampling ratio
	int sector_width = 8;
	
	//Data
	int data_entries = 2;
  DType2* data = (DType2*) calloc(data_entries,sizeof(DType2)); //2* re + im
	data[0].x = 5;//Re
	data[0].y = 0;//Im
	data[1].x = 1;//Re
	data[1].y = 0;//Im

	//Coords
	//Scaled between -0.5 and 0.5
	//in triplets (x,y,z) as structure of array
	//p0 = (0,0,0)
	//p1 0 (0.25,0.25,0.25)
  DType* coords = (DType*) calloc(3*data_entries,sizeof(DType));//3* x,y,z
	coords[0] = 0.00; //x0
	coords[1] = 0.25; //x1
	
	coords[2] = 0.00; //y0
	coords[3] = 0.25; //y0
	
	coords[4] = 0.00; //z0
	coords[5] = 0.25; //z1

  //Input data array, complex values
  //and copy to GPU 
	gpuNUFFT::GpuArray<DType2> dataArray_gpu;
	dataArray_gpu.dim.length = data_entries;

  allocateAndCopyToDeviceMem<DType2>(&dataArray_gpu.data,data,data_entries);
	
	//Input array containing trajectory in k-space
	gpuNUFFT::Array<DType> kSpaceData;
  kSpaceData.data = coords;
  kSpaceData.dim.length = data_entries;

	gpuNUFFT::Dimensions imgDims;
	imgDims.width = 64;
	imgDims.height = 64;
	imgDims.depth = 64;

  //precomputation performed by factory
  gpuNUFFT::GpuNUFFTOperatorFactory factory; 
  gpuNUFFT::GpuNUFFTOperator *gpuNUFFTOp = factory.createGpuNUFFTOperator(kSpaceData,kernel_width,sector_width,osf,imgDims);

	//Output Array
	gpuNUFFT::GpuArray<CufftType> imgArray_gpu;
	imgArray_gpu.dim = imgDims;
  allocateDeviceMem<CufftType>(&imgArray_gpu.data,imgArray_gpu.count());
	
	//Perform FT^H Operation
	gpuNUFFTOp->performGpuNUFFTAdj(dataArray_gpu, imgArray_gpu);
	
	//Perform FT Operation
  gpuNUFFTOp->performForwardGpuNUFFT(imgArray_gpu,dataArray_gpu);
	copyFromDevice(dataArray_gpu.data,data,data_entries);
	
	printf("contrast %f \n",data[0].x/data[1].x);
  EXPECT_NEAR(data[0].x/data[1].x,5.0,epsilon);

	free(data);
	free(coords);
  freeTotalDeviceMemory(dataArray_gpu.data,imgArray_gpu.data,NULL);

  delete gpuNUFFTOp;
}
Esempio n. 5
0
void
bluesteinsFFTGpu(const char* const argv[],const unsigned n, 
		 const unsigned orign,const unsigned size)
{
  const unsigned powM = (unsigned) log2(n);
  printf("Compiling Bluesteins Program..\n");

  compileProgram(argv, "fft.h", "kernels/bluesteins.cl");

    printf("Creating Kernel\n");
    for (unsigned i = 0; i < deviceCount; ++i) {
        createKernel(i, "bluesteins");
    }

    const unsigned sizePerGPU = size / deviceCount;
    for (unsigned i = 0; i < deviceCount; ++i) {
        workSize[i] = (i != (deviceCount - 1)) ? sizePerGPU 
                                               : (size - workOffset[i]);       
        
        allocateDeviceMemoryBS(i , workSize[i], workOffset[i]);
        
        clSetKernelArg(kernel[i], 0, sizeof(cl_mem), (void*) &d_Hreal[i]);
        clSetKernelArg(kernel[i], 1, sizeof(cl_mem), (void*) &d_Himag[i]);
	clSetKernelArg(kernel[i], 2, sizeof(cl_mem), (void*) &d_Yreal[i]);
        clSetKernelArg(kernel[i], 3, sizeof(cl_mem), (void*) &d_Yimag[i]);
	clSetKernelArg(kernel[i], 4, sizeof(cl_mem), (void*) &d_Zreal[i]);
        clSetKernelArg(kernel[i], 5, sizeof(cl_mem), (void*) &d_Zimag[i]);
	clSetKernelArg(kernel[i], 6, sizeof(unsigned), &n);
	clSetKernelArg(kernel[i], 7, sizeof(unsigned), &orign);
	clSetKernelArg(kernel[i], 8, sizeof(unsigned), &powM);
	clSetKernelArg(kernel[i], 9, sizeof(unsigned), &blockSize);
       

        if ((i + 1) < deviceCount) {
            workOffset[i + 1] = workOffset[i] + workSize[i];
        } 

    }

    size_t localWorkSize[] = {blockSize};
    for (unsigned i = 0; i < deviceCount; ++i) {
        size_t globalWorkSize[] = {shrRoundUp(blockSize, workSize[i])}; 
        // kernel non blocking execution 
        runKernel(i, localWorkSize, globalWorkSize);
    }

    h_Rreal = h_Hreal;
    h_Rimag = h_Himag;
    
    for (unsigned i = 0; i < deviceCount; ++i) {
        copyFromDevice(i, d_Hreal[i], h_Rreal + workOffset[i],
                                                workSize[i]); 
        copyFromDevice(i, d_Himag[i], h_Rimag + workOffset[i],
                                                 workSize[i]);
    }

    // wait for copy event
    const cl_int ciErrNum = clWaitForEvents(deviceCount, gpuDone);
    checkError(ciErrNum, CL_SUCCESS, "clWaitForEvents");
    printGpuTime();
}