cl_mem *CLWrapper::getDeviceArray() { if( !onDevice ) { if(!onHost ) { throw std::runtime_error("getDeviceArray(): not on device, and not on host"); } // std::cout << "copy array to device of " << N << " elements" << std::endl; copyToDevice(); } return &devicearray; }
void allocateDeviceMemoryBS(const unsigned device, const unsigned size, const unsigned copyOffset) { d_Hreal[device] = createDeviceBuffer( CL_MEM_READ_ONLY, sizeof(float) * size, h_Hreal + copyOffset); copyToDevice(device, d_Hreal[device], h_Hreal + copyOffset, size); d_Himag[device] = createDeviceBuffer( CL_MEM_READ_ONLY, sizeof(float) * size, h_Himag + copyOffset); copyToDevice(device, d_Himag[device], h_Himag + copyOffset, size); d_Yreal[device] = createDeviceBuffer(CL_MEM_WRITE_ONLY, sizeof(float) * size, h_Yreal + copyOffset); copyToDevice(device, d_Yreal[device], h_Yreal + copyOffset, size); d_Yimag[device] = createDeviceBuffer(CL_MEM_WRITE_ONLY, sizeof(float) * size, h_Yimag + copyOffset); copyToDevice(device, d_Yimag[device], h_Yimag + copyOffset, size); d_Zreal[device] = createDeviceBuffer(CL_MEM_WRITE_ONLY, sizeof(float) * size, h_Zreal + copyOffset); copyToDevice(device, d_Zreal[device], h_Zreal + copyOffset, size); d_Zimag[device] = createDeviceBuffer(CL_MEM_WRITE_ONLY, sizeof(float) * size, h_Zimag + copyOffset); copyToDevice(device, d_Zimag[device], h_Zimag + copyOffset, size); }
// ---------------------------------------------------------------------------- // gpuNUFFT_gpu: NUFFT // // Inverse gpuNUFFT implementation - interpolation from uniform grid data onto // nonuniform k-space data based on optimized // gpuNUFFT kernel with minimal oversampling // ratio (see Beatty et al.) // // Basic steps: - apodization correction // - zero padding with osf // - FFT // - convolution and resampling // // parameters: // * data : output kspace data // * data_count : number of samples on trajectory // * n_coils : number of channels or coils // * crds : coordinates on trajectory, passed as SoA // * imdata : input image data // * imdata_count : number of image data points // * grid_width : size of grid // * kernel : precomputed convolution kernel as lookup table // * kernel_count : number of kernel lookup table entries // * sectors : mapping of data indices according to each sector // * sector_count : number of sectors // * sector_centers: coordinates (x,y,z) of sector centers // * sector_width : width of sector // * im_width : dimension of image // * osr : oversampling ratio // * gpuNUFFT_out : enum indicating how far gpuNUFFT has to be processed // void gpuNUFFT::GpuNUFFTOperator::performForwardGpuNUFFT(gpuNUFFT::Array<DType2> imgData,gpuNUFFT::Array<CufftType>& kspaceData, GpuNUFFTOutput gpuNUFFTOut) { if (DEBUG) { std::cout << "performing forward gpuNUFFT!!!" << std::endl; std::cout << "dataCount: " << kspaceData.count() << " chnCount: " << kspaceData.dim.channels << std::endl; std::cout << "imgCount: " << imgData.count() << " gridWidth: " << this->getGridWidth() << std::endl; } showMemoryInfo(); if (debugTiming) startTiming(); int data_count = (int)this->kSpaceTraj.count(); int n_coils = (int)kspaceData.dim.channels; IndType imdata_count = this->imgDims.count(); int sector_count = (int)this->gridSectorDims.count(); //cuda mem allocation DType2 *imdata_d; CufftType *data_d; if (DEBUG) printf("allocate and copy imdata of size %d...\n",imdata_count); allocateDeviceMem<DType2>(&imdata_d,imdata_count); if (DEBUG) printf("allocate and copy data of size %d...\n",data_count); allocateDeviceMem<CufftType>(&data_d,data_count); initDeviceMemory(n_coils); if (debugTiming) printf("Memory allocation: %.2f ms\n",stopTiming()); int err; //iterate over coils and compute result for (int coil_it = 0; coil_it < n_coils; coil_it++) { int data_coil_offset = coil_it * data_count; int im_coil_offset = coil_it * (int)imdata_count; if (this->applySensData()) // perform automatically "repeating" of input image in case // of existing sensitivity data copyToDevice(imgData.data,imdata_d,imdata_count); else copyToDevice(imgData.data + im_coil_offset,imdata_d,imdata_count); //reset temp arrays cudaMemset(gdata_d,0, sizeof(CufftType)*gi_host->grid_width_dim); cudaMemset(data_d,0, sizeof(CufftType)*data_count); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 1: %s\n",cudaGetErrorString(cudaGetLastError())); if (this->applySensData()) { copyToDevice(this->sens.data + im_coil_offset, sens_d,imdata_count); performSensMul(imdata_d,sens_d,gi_host,false); } // apodization Correction if (n_coils > 1 && deapo_d != NULL) performForwardDeapodization(imdata_d,deapo_d,gi_host); else performForwardDeapodization(imdata_d,gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 2: %s\n",cudaGetErrorString(cudaGetLastError())); // resize by oversampling factor and zero pad performPadding(imdata_d,gdata_d,gi_host); if (debugTiming) startTiming(); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 3: %s\n",cudaGetErrorString(cudaGetLastError())); // shift image to get correct zero frequency position performFFTShift(gdata_d,INVERSE,getGridDims(),gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 4: %s\n",cudaGetErrorString(cudaGetLastError())); // eventually free imdata_d // Forward FFT to kspace domain if (err=pt2CufftExec(fft_plan, gdata_d, gdata_d, CUFFT_FORWARD) != CUFFT_SUCCESS) { fprintf(stderr,"cufft has failed with err %i \n",err); showMemoryInfo(true,stderr); } if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 5: %s\n",cudaGetErrorString(cudaGetLastError())); performFFTShift(gdata_d,FORWARD,getGridDims(),gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 6: %s\n",cudaGetErrorString(cudaGetLastError())); if (debugTiming) printf("FFT (incl. shift): %.2f ms\n",stopTiming()); if (debugTiming) startTiming(); // convolution and resampling to non-standard trajectory forwardConvolution(data_d,crds_d,gdata_d,NULL,sectors_d,sector_centers_d,gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 7: %s\n",cudaGetErrorString(cudaGetLastError())); if (debugTiming) printf("Forward Convolution: %.2f ms\n",stopTiming()); performFFTScaling(data_d,gi_host->data_count,gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error: at thread synchronization 8: %s\n",cudaGetErrorString(cudaGetLastError())); //write result in correct order back into output array writeOrderedGPU(data_sorted_d,data_indices_d,data_d,(int)this->kSpaceTraj.count()); copyFromDevice(data_sorted_d,kspaceData.data + data_coil_offset,data_count); }//iterate over coils freeTotalDeviceMemory(data_d,imdata_d,NULL); freeDeviceMemory(n_coils); if ((cudaThreadSynchronize() != cudaSuccess)) fprintf(stderr,"error in performForwardGpuNUFFT function: %s\n",cudaGetErrorString(cudaGetLastError())); free(gi_host); }
// ---------------------------------------------------------------------------- // performGpuNUFFTAdj: NUFFT^H // // GpuNUFFT implementation - interpolation from nonuniform k-space data onto // oversampled grid based on optimized gpuNUFFT kernel // with minimal oversampling ratio (see Beatty et al.) // // Basic steps: - density compensation // - convolution with interpolation function // - iFFT // - cropping due to oversampling ratio // - apodization correction // // parameters: // * data : input kspace data // * data_count : number of samples on trajectory // * n_coils : number of channels or coils // * crds : coordinate array on trajectory // * imdata : output image data // * imdata_count : number of image data points // * grid_width : size of grid // * kernel : precomputed convolution kernel as lookup table // * kernel_count : number of kernel lookup table entries // * sectors : mapping of start and end points of each sector // * sector_count : number of sectors // * sector_centers: coordinates (x,y,z) of sector centers // * sector_width : width of sector // * im_width : dimension of image // * osr : oversampling ratio // * do_comp : true, if density compensation has to be done // * density_comp : densiy compensation array // * gpuNUFFT_out : enum indicating how far gpuNUFFT has to be processed // void gpuNUFFT::GpuNUFFTOperator::performGpuNUFFTAdj(gpuNUFFT::Array<DType2> kspaceData, gpuNUFFT::Array<CufftType>& imgData, GpuNUFFTOutput gpuNUFFTOut) { if (DEBUG) { std::cout << "performing gpuNUFFT adjoint!!!" << std::endl; std::cout << "dataCount: " << kspaceData.count() << " chnCount: " << kspaceData.dim.channels << std::endl; std::cout << "imgCount: " << imgData.count() << " gridWidth: " << this->getGridWidth() << std::endl; std::cout << "apply density comp: " << this->applyDensComp() << std::endl; std::cout << "apply sens data: " << this->applySensData() << std::endl; } if (debugTiming) startTiming(); showMemoryInfo(); int data_count = (int)this->kSpaceTraj.count(); int n_coils = (int)kspaceData.dim.channels; IndType imdata_count = this->imgDims.count(); int sector_count = (int)this->gridSectorDims.count(); // select data ordered and leave it on gpu DType2* data_d; if (DEBUG) printf("allocate data of size %d...\n",data_count); allocateDeviceMem<DType2>(&data_d,data_count); CufftType *imdata_d, *imdata_sum_d = NULL; if (DEBUG) printf("allocate and copy imdata of size %d...\n",imdata_count); allocateDeviceMem<CufftType>(&imdata_d,imdata_count); if (this->applySensData()) { if (DEBUG) printf("allocate and copy temp imdata of size %d...\n",imdata_count); allocateDeviceMem<CufftType>(&imdata_sum_d,imdata_count); cudaMemset(imdata_sum_d,0,imdata_count*sizeof(CufftType)); } initDeviceMemory(n_coils); int err; if (debugTiming) printf("Memory allocation: %.2f ms\n",stopTiming()); //iterate over coils and compute result for (int coil_it = 0; coil_it < n_coils; coil_it++) { int data_coil_offset = coil_it * data_count; int im_coil_offset = coil_it * (int)imdata_count;//gi_host->width_dim; cudaMemset(gdata_d,0, sizeof(CufftType)*gi_host->grid_width_dim); //copy coil data to device and select ordered copyToDevice(kspaceData.data + data_coil_offset,data_d,data_count); selectOrderedGPU(data_d,data_indices_d,data_sorted_d,data_count); if (this->applyDensComp()) performDensityCompensation(data_sorted_d,density_comp_d,gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at adj thread synchronization 1: %s\n",cudaGetErrorString(cudaGetLastError())); if (debugTiming) startTiming(); adjConvolution(data_sorted_d,crds_d,gdata_d,NULL,sectors_d,sector_centers_d,gi_host); if (debugTiming) printf("Adjoint convolution: %.2f ms\n",stopTiming()); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) fprintf(stderr,"error at adj thread synchronization 2: %s\n",cudaGetErrorString(cudaGetLastError())); if (gpuNUFFTOut == CONVOLUTION) { if (DEBUG) printf("stopping output after CONVOLUTION step\n"); //get output copyFromDevice<CufftType>(gdata_d,imgData.data,gi_host->grid_width_dim); if (DEBUG) printf("test value at point zero: %f\n",(imgData.data)[0].x); free(gi_host); freeTotalDeviceMemory(data_d,imdata_d,imdata_sum_d,NULL); freeDeviceMemory(n_coils); return; } if ((cudaThreadSynchronize() != cudaSuccess)) fprintf(stderr,"error at adj thread synchronization 3: %s\n",cudaGetErrorString(cudaGetLastError())); if (debugTiming) startTiming(); performFFTShift(gdata_d,INVERSE,getGridDims(),gi_host); //Inverse FFT if (err=pt2CufftExec(fft_plan, gdata_d, gdata_d, CUFFT_INVERSE) != CUFFT_SUCCESS) { fprintf(stderr,"cufft has failed at adj with err %i \n",err); showMemoryInfo(true,stderr); } if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) fprintf(stderr,"error at adj thread synchronization 4: %s\n",cudaGetErrorString(cudaGetLastError())); if (gpuNUFFTOut == FFT) { if (DEBUG) printf("stopping output after FFT step\n"); //get output copyFromDevice<CufftType>(gdata_d,imgData.data,gi_host->grid_width_dim); free(gi_host); freeTotalDeviceMemory(data_d,imdata_d,imdata_sum_d,NULL); freeDeviceMemory(n_coils); printf("last cuda error: %s\n", cudaGetErrorString(cudaGetLastError())); return; } if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at adj thread synchronization 5: %s\n",cudaGetErrorString(cudaGetLastError())); performFFTShift(gdata_d,INVERSE,getGridDims(),gi_host); if (debugTiming) printf("iFFT (incl. shift) : %.2f ms\n",stopTiming()); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at adj thread synchronization 6: %s\n",cudaGetErrorString(cudaGetLastError())); performCrop(gdata_d,imdata_d,gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at adj thread synchronization 7: %s\n",cudaGetErrorString(cudaGetLastError())); //check if precomputed deapo function can be used if (n_coils > 1 && deapo_d != NULL) performDeapodization(imdata_d,deapo_d,gi_host); else performDeapodization(imdata_d,gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at adj thread synchronization 8: %s\n",cudaGetErrorString(cudaGetLastError())); performFFTScaling(imdata_d,gi_host->im_width_dim,gi_host); if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error: at adj thread synchronization 9: %s\n",cudaGetErrorString(cudaGetLastError())); if (this->applySensData()) { copyToDevice(this->sens.data + im_coil_offset, sens_d,imdata_count); performSensMul(imdata_d,sens_d,gi_host,true); performSensSum(imdata_d,imdata_sum_d,gi_host); } else { // get result per coil // no summation is performed in absence of sensitity data copyFromDevice<CufftType>(imdata_d,imgData.data+im_coil_offset,imdata_count); } if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error: at adj thread synchronization 10: %s\n",cudaGetErrorString(cudaGetLastError())); }//iterate over coils if (this->applySensData()) { // get result of combined coils copyFromDevice<CufftType>(imdata_sum_d,imgData.data,imdata_count); } if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error: at adj thread synchronization 11: %s\n",cudaGetErrorString(cudaGetLastError())); freeTotalDeviceMemory(data_d,imdata_d,imdata_sum_d,NULL); freeDeviceMemory(n_coils); if ((cudaThreadSynchronize() != cudaSuccess)) fprintf(stderr,"error in gpuNUFFT_gpu_adj function: %s\n",cudaGetErrorString(cudaGetLastError())); free(gi_host); }
// **************************************************************************** // Function: RunBenchmark // // Purpose: // Runs the stablity test. The algorithm for the parallel // version of the test, which enables testing of an entire GPU // cluster at the same time, is as follows. Each participating node // first allocates its data, while node zero additionally determines // start and finish times based on a user input parameter. All nodes // then enter the outermost loop, copying fresh data from the CPU // before entering the core of the test. In the core, each node // performs a loop consisting of the forward kernel, a potential // check, and then the inverse kernel. After performing a configurable // number of forward/inverse iterations, along with a configurable // number of checks, each node sends the number of failures it // encountered to node zero. Node zero collects and reports the error // counts, determines whether the test has run its course, and // broadcasts the decision. If the decision is to proceed, each node // begins the next iteration of the outer loop, copying fresh data and // then performing the kernels and checks of the core loop. // // Arguments: // resultDB: the benchmark stores its results in this ResultDatabase // op: the options parser / parameter database // // Returns: nothing // // Programmer: Collin McCurdy // Creation: September 08, 2009 // // Modifications: // // **************************************************************************** void RunBenchmark(ResultDatabase &resultDB, OptionParser& op) { int mpi_rank, mpi_size, node_rank; int i, j; float2* source, * result; void* work, * chk; #ifdef PARALLEL MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); NodeInfo NI; node_rank = NI.nodeRank(); cout << "MPI Task " << mpi_rank << " of " << mpi_size << " (noderank=" << node_rank << ") starting....\n"; #else mpi_rank = 0; mpi_size = 1; node_rank = 0; #endif // ensure chk buffer alloc succeeds before grabbing the // rest of available memory. allocDeviceBuffer(&chk, 1); unsigned long avail_bytes = findAvailBytes(); // unsigned long avail_bytes = 1024*1024*1024-1; // now determine how much available memory will be used (subject // to CUDA's constraint on the maximum block dimension size) int blocks = avail_bytes / (512*sizeof(float2)); int slices = 1; while (blocks/slices > 65535) { slices *= 2; } int half_n_ffts = ((blocks/slices)*slices)/2; int n_ffts = half_n_ffts * 2; fprintf(stderr, "avail_bytes=%ld, blocks=%d, n_ffts=%d\n", avail_bytes, blocks, n_ffts); int half_n_cmplx = half_n_ffts * 512; unsigned long used_bytes = half_n_cmplx * 2 * sizeof(float2); cout << mpi_rank << ": testing " << used_bytes/((double)1024*1024) << " MBs\n"; // allocate host memory source = (float2*)malloc(used_bytes); result = (float2*)malloc(used_bytes); // alloc device memory allocDeviceBuffer(&work, used_bytes); // alloc gather buffer int* recvbuf = (int*)malloc(mpi_size*sizeof(int)); // compute start and finish times time_t start = time(NULL); time_t finish = start + (time_t)(op.getOptionInt("time")*60); struct tm start_tm, finish_tm; localtime_r(&start, &start_tm); localtime_r(&finish, &finish_tm); if (mpi_rank == 0) { printf("start = %s", asctime(&start_tm)); printf("finish = %s", asctime(&finish_tm)); } for (int iter = 0; ; iter++) { bool failed = false; int errorCount = 0, stop = 0; // (re-)init host memory... for (i = 0; i < half_n_cmplx; i++) { source[i].x = (rand()/(float)RAND_MAX)*2-1; source[i].y = (rand()/(float)RAND_MAX)*2-1; source[i+half_n_cmplx].x = source[i].x; source[i+half_n_cmplx].y = source[i].y; } // copy to device copyToDevice(work, source, used_bytes); copyToDevice(chk, &errorCount, 1); forward(work, n_ffts); if (check(work, chk, half_n_ffts, half_n_cmplx)) { fprintf(stderr, "First check failed..."); failed = true; } if (!failed) { for (i = 1; i <= CHECKS; i++) { for (j = 1; j <= ITERS_PER_CHECK; j++) { inverse(work, n_ffts); forward(work, n_ffts); } if (check(work, chk, half_n_ffts, half_n_cmplx)) { failed = true; break; } } } // failing node is responsible for verifying failure, counting // errors and reporting count to node 0. if (failed) { fprintf(stderr, "Failure on node %d, iter %d:", mpi_rank, iter); // repeat check on CPU copyFromDevice(result, work, used_bytes); float2* result2 = result + half_n_cmplx; for (j = 0; j < half_n_cmplx; j++) { if (result[j].x != result2[j].x || result[j].y != result2[j].y) { errorCount++; } } if (!errorCount) { fprintf(stderr, "verification failed!\n"); } else { fprintf(stderr, "%d errors\n", errorCount); } } #ifdef PARALLEL MPI_Gather(&errorCount, 1, MPI_INT, recvbuf, 1, MPI_INT, 0, MPI_COMM_WORLD); #else recvbuf[0] = errorCount; #endif // node 0 collects and reports error counts, determines // whether test has run its course, and broadcasts decision if (mpi_rank == 0) { time_t curtime = time(NULL); struct tm curtm; localtime_r(&curtime, &curtm); fprintf(stderr, "iter=%d: %s", iter, asctime(&curtm)); for (i = 0; i < mpi_size; i++) { if (recvbuf[i]) { fprintf(stderr, "--> %d failures on node %d\n", recvbuf[i], i); } } if (curtime > finish) { stop = 1; } } #ifdef PARALLEL MPI_Bcast(&stop, 1, MPI_INT, 0, MPI_COMM_WORLD); #endif resultDB.AddResult("Check", "", "Failures", errorCount); if (stop) break; } freeDeviceBuffer(work); freeDeviceBuffer(chk); free(source); free(result); free(recvbuf); }
OutProxy inOutCopy(cl::CommandQueue &queue, cl::Event &event) { copyToDevice(); return OutProxy(queue, *this, event); }
cl_mem inCopy(cl::CommandQueue &queue) { copyToDevice(); return m_buffer; }