/*
 * Class:     jcuda_jcufft_JCufft
 * Method:    cufftExecR2CNative
 * Signature: (Ljcuda/jcufft/cufftHandle;Ljcuda/Pointer;Ljcuda/Pointer;)I
 *
 * Executes a single-precision real-to-complex transform for the given plan.
 * Returns the cufftResult status code (or JCUFFT_INTERNAL_ERROR after
 * raising a Java NullPointerException for a null argument).
 */
JNIEXPORT jint JNICALL Java_jcuda_jcufft_JCufft_cufftExecR2CNative
    (JNIEnv *env, jclass cla, jobject handle, jobject rIdata, jobject cOdata)
{
    // Guard every Java object argument before it is dereferenced below.
    if (handle == NULL)
    {
        ThrowByName(env, "java/lang/NullPointerException", "Parameter 'handle' is null for cufftExecR2C");
        return JCUFFT_INTERNAL_ERROR;
    }
    if (rIdata == NULL)
    {
        ThrowByName(env, "java/lang/NullPointerException", "Parameter 'rIdata' is null for cufftExecR2C");
        return JCUFFT_INTERNAL_ERROR;
    }
    if (cOdata == NULL)
    {
        ThrowByName(env, "java/lang/NullPointerException", "Parameter 'cOdata' is null for cufftExecR2C");
        return JCUFFT_INTERNAL_ERROR;
    }
    Logger::log(LOG_TRACE, "Executing cufftExecR2C\n");

    // Unwrap the Java-side objects into their native counterparts.
    cufftHandle plan = env->GetIntField(handle, cufftHandle_plan);
    float *input = (float*)getPointer(env, rIdata);
    cufftComplex *output = (cufftComplex*)getPointer(env, cOdata);

    // Run the transform and hand the cuFFT status straight back to Java.
    return cufftExecR2C(plan, input, output);
}
void transformStack(const FreeImageStack & rImageStack, FourierImageStack & rFourierStack) { unsigned int nMaxSlices = rImageStack.slices(); if (nMaxSlices > rFourierStack.slices()) nMaxSlices = rFourierStack.slices(); NppiSize oSizeROI = {rImageStack.width(), rImageStack.height()}; // create plan for the FFT cufftHandle oPlanCUFFT; NPP_CHECK_CUFFT(cufftPlan2d(&oPlanCUFFT, oSizeROI.width, oSizeROI.height, CUFFT_R2C)); // allocate 32-bit float intermediate image // for this image to work with cuFFT, we must have tightly packed pixels. npp::ImageNPP<Npp32f, 1, FrugalAllocator_32f_C1> oSource_32f_C1(oSizeROI.width, oSizeROI.height); NPP_DEBUG_ASSERT(oSource_32f_C1.width() * sizeof(Npp32f) == oSource_32f_C1.pitch()); // allocate 8-bit image npp::ImageNPP_8u_C1 oSource_8u_C1; for (unsigned int iSlice = 0; iSlice < nMaxSlices; ++iSlice) { // load slice rImageStack.loadImage(iSlice, oSource_8u_C1); // upconvert 8-bit image to 32-bit float image NPP_CHECK_NPP(nppiConvert_8u32f_C1R(oSource_8u_C1.data(), oSource_8u_C1.pitch(), oSource_32f_C1.data(), oSource_32f_C1.pitch(), oSizeROI)); NPP_CHECK_CUFFT(cufftExecR2C(oPlanCUFFT, oSource_32f_C1.data(), reinterpret_cast<cufftComplex *>(rFourierStack.data(iSlice)))); } }
/*
 * Runs a forward real-to-complex FFT in place on the host buffer h_data:
 * allocates a device buffer of the plan's size, copies the data up, executes
 * cufftExecR2C in place, and copies the result back.  Any CUDA/cuFFT failure
 * prints a diagnostic and terminates the process with a distinct exit code.
 */
void sararfftnd_one_real_to_complex( sararfftnd_plan plan, sarafft_real *h_data ) {
    CUdeviceptr d_data;
    size_t planSize = getPlanSize( plan );
    cufftResult fftResult;

    if ( CUDA_SUCCESS != cuMemAlloc( &d_data, planSize ) ) {
        /* BUGFIX: size_t must be printed with %zu (was %li, which is wrong
         * on LLP64 platforms and undefined behavior in general). */
        printf( "cuMemAlloc failed for plansize %zu!\n", planSize );
        fflush ( stdout );
        exit( 85 );
    }
    if ( CUDA_SUCCESS != cuMemcpyHtoD( d_data, h_data, planSize ) ) {
        printf( "cuMemcpyHtoD failed!\n" );
        fflush ( stdout );
        exit( 86 );
    }

    /* In-place transform: cuFFT permits idata == odata for R2C. */
    fftResult = cufftExecR2C( plan, ( cufftReal* )d_data, ( cufftComplex* )d_data );
    if ( CUFFT_SUCCESS != fftResult ) {
        printf( "cufftExecR2C failed with code %d\n", fftResult );
        fflush ( stdout );
        exit( 87 );
    }

    if ( CUDA_SUCCESS != cuMemcpyDtoH( h_data, d_data, planSize ) ) {
        printf( "cuMemcpyDtoH failed!\n" );
        fflush ( stdout );
        exit( 88 );
    }
    if ( CUDA_SUCCESS != cuMemFree( d_data ) ) {
        printf( "cuMemFree failed!\n" );
        fflush ( stdout );
        exit( 89 );
    }
}
void forward(const GPUMatrixPitched<TSignalType>& x, GPUMatrixPitched<TSpectralType>& fft_x) { // A GPU matrix is aligned to CRS_BLOCK_SIZE elements but the CuFFT // takes the elements in a contiguous form in row-major order. Thus, // we can only be sure that the CuFFT works as intended, if the aligned // and the contiguous storage are the same. AGILE_ASSERT(x.getNumColumns() == x.getPitchElements(), StandardException::ExceptionMessage( "FFT of non-aligned matrices not implemented, yet")); // create a plan createPlan(x.getNumRows(), x.getNumColumns()); cufftResult result = cufftExecR2C( m_plan, (typename to_cufft_type<TSignalType>::type*)x.data(), (typename to_cufft_type<TSpectralType>::type*)fft_x.data()); AGILE_ASSERT(result == CUFFT_SUCCESS, StandardException::ExceptionMessage( "Error during FFT procedure")); // scale(TSpectralType(1./std::sqrt(x.getNumRows() * x.getNumColumns())), // fft_x, fft_x); }
bool test0(void) { float *h_Data, *h_Kernel, *h_ResultCPU, *h_ResultGPU; float *d_Data, *d_PaddedData, *d_Kernel, *d_PaddedKernel; fComplex *d_DataSpectrum, *d_KernelSpectrum; cufftHandle fftPlanFwd, fftPlanInv; bool bRetVal; StopWatchInterface *hTimer = NULL; sdkCreateTimer(&hTimer); printf("Testing built-in R2C / C2R FFT-based convolution\n"); const int kernelH = 3; const int kernelW = 3; const int kernelY = 1; const int kernelX = 1; const int dataH = 10; const int dataW = 10; const int fftH = snapTransformSize(dataH + kernelH - 1); const int fftW = snapTransformSize(dataW + kernelW - 1); printf("...allocating memory\n"); h_Data = (float *)malloc(dataH * dataW * sizeof(float)); h_Kernel = (float *)malloc(kernelH * kernelW * sizeof(float)); h_ResultCPU = (float *)malloc(dataH * dataW * sizeof(float)); h_ResultGPU = (float *)malloc(fftH * fftW * sizeof(float)); checkCudaErrors(cudaMalloc((void **)&d_Data, dataH * dataW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_Kernel, kernelH * kernelW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_PaddedData, fftH * fftW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_PaddedKernel, fftH * fftW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_DataSpectrum, fftH * (fftW / 2 + 1) * sizeof(fComplex))); checkCudaErrors(cudaMalloc((void **)&d_KernelSpectrum, fftH * (fftW / 2 + 1) * sizeof(fComplex))); printf("...generating random input data\n"); srand(2010); for (int i = 0; i < dataH * dataW; i++) { //h_Data[i] = getRand(); h_Data[i] = i + 1; } for (int i = 0; i < kernelH * kernelW; i++) { //h_Kernel[i] = getRand(); h_Kernel[i] = i + 1; } FILE* fp2 = fopen("input_kernel.txt", "w+"); FILE* fp3 = fopen("input_data.txt", "w+"); for (int i = 0; i < dataH * dataW; i++) fprintf(fp3, "%f\n", h_Data[i]); for (int i = 0; i < kernelH * kernelW; i++) fprintf(fp2, "%f\n", h_Kernel[i]); fclose(fp2); fclose(fp3); printf("...creating R2C & C2R FFT plans for %i x %i\n", fftH, fftW); 
checkCudaErrors(cufftPlan2d(&fftPlanFwd, fftH, fftW, CUFFT_R2C)); checkCudaErrors(cufftPlan2d(&fftPlanInv, fftH, fftW, CUFFT_C2R)); printf("...uploading to GPU and padding convolution kernel and input data\n"); checkCudaErrors(cudaMemcpy(d_Kernel, h_Kernel, kernelH * kernelW * sizeof(float), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_Data, h_Data, dataH * dataW * sizeof(float), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemset(d_PaddedKernel, 0, fftH * fftW * sizeof(float))); checkCudaErrors(cudaMemset(d_PaddedData, 0, fftH * fftW * sizeof(float))); padKernel( d_PaddedKernel, d_Kernel, fftH, fftW, kernelH, kernelW, kernelY, kernelX ); padDataClampToBorder( d_PaddedData, d_Data, fftH, fftW, dataH, dataW, kernelH, kernelW, kernelY, kernelX ); //Not including kernel transformation into time measurement, //since convolution kernel is not changed very frequently printf("...transforming convolution kernel\n"); checkCudaErrors(cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedKernel, (cufftComplex *)d_KernelSpectrum)); printf("...running GPU FFT convolution: "); checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); checkCudaErrors(cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedData, (cufftComplex *)d_DataSpectrum)); modulateAndNormalize(d_DataSpectrum, d_KernelSpectrum, fftH, fftW, 1); checkCudaErrors(cufftExecC2R(fftPlanInv, (cufftComplex *)d_DataSpectrum, (cufftReal *)d_PaddedData)); checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); double gpuTime = sdkGetTimerValue(&hTimer); printf("%f MPix/s (%f ms)\n", (double)dataH * (double)dataW * 1e-6 / (gpuTime * 0.001), gpuTime); printf("...reading back GPU convolution results\n"); checkCudaErrors(cudaMemcpy(h_ResultGPU, d_PaddedData, fftH * fftW * sizeof(float), cudaMemcpyDeviceToHost)); printf("...running reference CPU convolution\n"); convolutionClampToBorderCPU( h_ResultCPU, h_Data, h_Kernel, dataH, dataW, kernelH, kernelW, kernelY, kernelX ); printf("...comparing 
the results: "); double sum_delta2 = 0; double sum_ref2 = 0; double max_delta_ref = 0; for (int y = 0; y < dataH; y++) for (int x = 0; x < dataW; x++) { double rCPU = (double)h_ResultCPU[y * dataW + x]; double rGPU = (double)h_ResultGPU[y * fftW + x]; double delta = (rCPU - rGPU) * (rCPU - rGPU); double ref = rCPU * rCPU + rCPU * rCPU; if ((delta / ref) > max_delta_ref) { max_delta_ref = delta / ref; } sum_delta2 += delta; sum_ref2 += ref; } double L2norm = sqrt(sum_delta2 / sum_ref2); printf("rel L2 = %E (max delta = %E)\n", L2norm, sqrt(max_delta_ref)); bRetVal = (L2norm < 1e-6) ? true : false; printf(bRetVal ? "L2norm Error OK\n" : "L2norm Error too high!\n"); printf("...shutting down\n"); sdkStartTimer(&hTimer); checkCudaErrors(cufftDestroy(fftPlanInv)); checkCudaErrors(cufftDestroy(fftPlanFwd)); checkCudaErrors(cudaFree(d_DataSpectrum)); checkCudaErrors(cudaFree(d_KernelSpectrum)); checkCudaErrors(cudaFree(d_PaddedData)); checkCudaErrors(cudaFree(d_PaddedKernel)); checkCudaErrors(cudaFree(d_Data)); checkCudaErrors(cudaFree(d_Kernel)); FILE* fp = fopen("result_gpu.txt", "w+"); FILE* fp1 = fopen("result_cpu.txt", "w+"); for (int i = 0; i < dataH * dataW; i++) { fprintf(fp, "%f\n", h_ResultGPU[i]); fprintf(fp1, "%f\n", h_ResultCPU[i]); } fclose(fp); fclose(fp1); free(h_ResultGPU); free(h_ResultCPU); free(h_Data); free(h_Kernel); return bRetVal; }
/* Wine thunk: trace the call, then forward it unchanged to the native
 * cuFFT real-to-complex execution entry point. */
cufftResult WINAPI wine_cufftExecR2C(cufftHandle plan, cufftReal *idata, cufftComplex *odata)
{
    WINE_TRACE("\n");
    return cufftExecR2C( plan, idata, odata );
}
void accfft_execute_gpuf(accfft_plan_gpuf* plan, int direction,float * data_d, float * data_out_d, double * timer,std::bitset<3> xyz){ if(data_d==NULL) data_d=plan->data; if(data_out_d==NULL) data_out_d=plan->data_out; int * coords=plan->coord; int procid=plan->procid; double fft_time=0; double timings[5]={0}; cudaEvent_t memcpy_startEvent, memcpy_stopEvent; cudaEvent_t fft_startEvent, fft_stopEvent; checkCuda_accfft( cudaEventCreate(&memcpy_startEvent) ); checkCuda_accfft( cudaEventCreate(&memcpy_stopEvent) ); checkCuda_accfft( cudaEventCreate(&fft_startEvent) ); checkCuda_accfft( cudaEventCreate(&fft_stopEvent) ); int NY=plan->N[1]; float dummy_time=0; int *osize_0 =plan->osize_0;// *ostart_0 =plan->ostart_0; int *osize_1 =plan->osize_1;// *ostart_1 =plan->ostart_1; //int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2; int *osize_1i=plan->osize_1i;//*ostart_1i=plan->ostart_1i; //int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i; if(direction==-1){ /**************************************************************/ /******************* N0/P0 x N1/P1 x N2 **********************/ /**************************************************************/ // FFT in Z direction if(xyz[2]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); checkCuda_accfft (cufftExecR2C(plan->fplan_0,(cufftReal*)data_d, (cufftComplex*)data_out_d)); checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } else data_out_d=data_d; // Perform N0/P0 transpose if(!plan->oneD){ plan->T_plan_1->execute_gpu(plan->T_plan_1,data_out_d,timings,2,osize_0[0],coords[0]); } /**************************************************************/ /******************* N0/P0 x N1 x N2/P1 **********************/ /**************************************************************/ if(xyz[1]){ checkCuda_accfft( 
cudaEventRecord(fft_startEvent,0) ); for (int i=0;i<osize_1[0];++i){ checkCuda_accfft (cufftExecC2C(plan->fplan_1,(cufftComplex*)&data_out_d[2*i*osize_1[1]*osize_1[2]], (cufftComplex*)&data_out_d[2*i*osize_1[1]*osize_1[2]],CUFFT_FORWARD)); } checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; MPI_Barrier(plan->c_comm); } if(plan->oneD){ plan->T_plan_2->execute_gpu(plan->T_plan_2,data_out_d,timings,2); } else{ plan->T_plan_2->execute_gpu(plan->T_plan_2,data_out_d,timings,2,1,coords[1]); } /**************************************************************/ /******************* N0 x N1/P0 x N2/P1 **********************/ /**************************************************************/ if(xyz[0]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); checkCuda_accfft (cufftExecC2C(plan->fplan_2,(cufftComplex*)data_out_d, (cufftComplex*)data_out_d,CUFFT_FORWARD)); checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } } else if (direction==1){ if(xyz[0]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); checkCuda_accfft (cufftExecC2C(plan->fplan_2,(cufftComplex*)data_d, (cufftComplex*)data_d,CUFFT_INVERSE)); checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; MPI_Barrier(plan->c_comm); } if(plan->oneD){ plan->T_plan_2i->execute_gpu(plan->T_plan_2i,(float*)data_d,timings,1); } else{ plan->T_plan_2i->execute_gpu(plan->T_plan_2i,(float*)data_d,timings,1,1,coords[1]); } 
/**************************************************************/ /******************* N0/P0 x N1 x N2/P1 **********************/ /**************************************************************/ if(xyz[1]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); for (int i=0;i<osize_1i[0];++i){ checkCuda_accfft (cufftExecC2C(plan->fplan_1,(cufftComplex*)&data_d[2*i*NY*osize_1i[2]], (cufftComplex*)&data_d[2*i*NY*osize_1i[2]],CUFFT_INVERSE)); } checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; MPI_Barrier(plan->c_comm); } if(!plan->oneD){ plan->T_plan_1i->execute_gpu(plan->T_plan_1i,(float*)data_d,timings,1,osize_1i[0],coords[0]); } MPI_Barrier(plan->c_comm); /**************************************************************/ /******************* N0/P0 x N1/P1 x N2 **********************/ /**************************************************************/ // IFFT in Z direction if(xyz[2]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); checkCuda_accfft (cufftExecC2R(plan->iplan_0,(cufftComplex*)data_d,(cufftReal*)data_out_d)); checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } else data_out_d=data_d; } timings[4]+=fft_time; if(timer==NULL){ //delete [] timings; } else{ timer[0]+=timings[0]; timer[1]+=timings[1]; timer[2]+=timings[2]; timer[3]+=timings[3]; timer[4]+=timings[4]; } MPI_Barrier(plan->c_comm); return; }