Exemple #1
0
/*
 * Class:     jcuda_jcufft_JCufft
 * Method:    cufftExecR2CNative
 * Signature: (Ljcuda/jcufft/cufftHandle;Ljcuda/Pointer;Ljcuda/Pointer;)I
 */
JNIEXPORT jint JNICALL Java_jcuda_jcufft_JCufft_cufftExecR2CNative
  (JNIEnv *env, jclass cla, jobject handle, jobject rIdata, jobject cOdata)
{
    if (handle == NULL)
    {
        ThrowByName(env, "java/lang/NullPointerException", "Parameter 'handle' is null for cufftExecR2C");
        return JCUFFT_INTERNAL_ERROR;
    }
    if (rIdata == NULL)
    {
        ThrowByName(env, "java/lang/NullPointerException", "Parameter 'rIdata' is null for cufftExecR2C");
        return JCUFFT_INTERNAL_ERROR;
    }
    if (cOdata == NULL)
    {
        ThrowByName(env, "java/lang/NullPointerException", "Parameter 'cOdata' is null for cufftExecR2C");
        return JCUFFT_INTERNAL_ERROR;
    }

    Logger::log(LOG_TRACE, "Executing cufftExecR2C\n");

    cufftHandle nativePlan = env->GetIntField(handle, cufftHandle_plan);
    float* nativeRIData = (float*)getPointer(env, rIdata);
    cufftComplex* nativeCOData = (cufftComplex*)getPointer(env, cOdata);

    cufftResult result = cufftExecR2C(nativePlan, nativeRIData, nativeCOData);
    return result;
}
Exemple #2
0
void
transformStack(const FreeImageStack & rImageStack, FourierImageStack & rFourierStack)
{
    unsigned int nMaxSlices = rImageStack.slices();
    if (nMaxSlices > rFourierStack.slices())
        nMaxSlices = rFourierStack.slices();

    NppiSize oSizeROI = {rImageStack.width(), rImageStack.height()};
            // create plan for the FFT
    cufftHandle oPlanCUFFT;
    NPP_CHECK_CUFFT(cufftPlan2d(&oPlanCUFFT, oSizeROI.width, oSizeROI.height, CUFFT_R2C));
            // allocate 32-bit float intermediate image
            // for this image to work with cuFFT, we must have tightly packed pixels.
    npp::ImageNPP<Npp32f, 1, FrugalAllocator_32f_C1> oSource_32f_C1(oSizeROI.width, oSizeROI.height);
    NPP_DEBUG_ASSERT(oSource_32f_C1.width() * sizeof(Npp32f) == oSource_32f_C1.pitch());
            // allocate 8-bit image 
    npp::ImageNPP_8u_C1 oSource_8u_C1;
    for (unsigned int iSlice = 0; iSlice < nMaxSlices; ++iSlice)
    {
                // load slice
        rImageStack.loadImage(iSlice, oSource_8u_C1);        
                // upconvert 8-bit image to 32-bit float image
        NPP_CHECK_NPP(nppiConvert_8u32f_C1R(oSource_8u_C1.data(),  oSource_8u_C1.pitch(), 
                                            oSource_32f_C1.data(), oSource_32f_C1.pitch(),
                                            oSizeROI));
        NPP_CHECK_CUFFT(cufftExecR2C(oPlanCUFFT, oSource_32f_C1.data(), reinterpret_cast<cufftComplex *>(rFourierStack.data(iSlice))));
    }
}
Exemple #3
0
void sararfftnd_one_real_to_complex(
  sararfftnd_plan plan, sarafft_real *h_data
) {
  CUdeviceptr d_data;
  size_t planSize = getPlanSize( plan );
//   printf( "planSize = %li!\n", planSize );
//   fflush ( stdout );
  cufftResult fftResult;
  CUresult cudaResult;
  if ( CUDA_SUCCESS != cuMemAlloc( &d_data, planSize ) ) {
    printf( "cuMemAlloc failed for plansize %li!\n", planSize );
    fflush ( stdout );
    exit( 85 );
  }
  if ( CUDA_SUCCESS != cuMemcpyHtoD( d_data, h_data, planSize ) ) {
    printf( "cuMemcpyHtoD failed!\n" );
    fflush ( stdout );
    exit( 86 );
  }
//   cudaError_t cudaError = cudaGetLastError();
//   if( cudaError != cudaSuccess ) {
//     printf( "CUDA Runtime API Error reported : %s\n", cudaGetErrorString(cudaError));
//     fflush ( stdout );
//     exit( 87 );
//   } else {
//     printf( "CUDA is in good shape.\n");
//     fflush ( stdout );
//   }
  fftResult = cufftExecR2C( plan, ( cufftReal* )d_data, ( cufftComplex* )d_data );
  if ( CUFFT_SUCCESS != fftResult ) {
    printf( "cufftExecR2C failed with code %d\n", fftResult );
    fflush ( stdout );
    exit( 87 );
  }
  if ( CUDA_SUCCESS != cuMemcpyDtoH( h_data, d_data, planSize ) ) {
    printf( "cuMemcpyDtoH failed!\n" );
    fflush ( stdout );
    exit( 88 );
  }
  if ( CUDA_SUCCESS != cuMemFree( d_data ) ) {
    printf( "cuMemFree failed!\n" );
    fflush ( stdout );
    exit( 89 );
  }
}
Exemple #4
0
      void forward(const GPUMatrixPitched<TSignalType>& x,
                   GPUMatrixPitched<TSpectralType>& fft_x)
      {
        // A GPU matrix is aligned to CRS_BLOCK_SIZE elements but the CuFFT
        // takes the elements in a contiguous form in row-major order. Thus,
        // we can only be sure that the CuFFT works as intended, if the aligned
        // and the contiguous storage are the same.
        AGILE_ASSERT(x.getNumColumns() == x.getPitchElements(),
                      StandardException::ExceptionMessage(
                        "FFT of non-aligned matrices not implemented, yet"));

        // create a plan
        createPlan(x.getNumRows(), x.getNumColumns());
        cufftResult result = cufftExecR2C(
          m_plan,
          (typename to_cufft_type<TSignalType>::type*)x.data(),
          (typename to_cufft_type<TSpectralType>::type*)fft_x.data());
          AGILE_ASSERT(result == CUFFT_SUCCESS,
                        StandardException::ExceptionMessage(
                          "Error during FFT procedure"));

//         scale(TSpectralType(1./std::sqrt(x.getNumRows() * x.getNumColumns())),
//               fft_x, fft_x);
      }
Exemple #5
0
bool test0(void)
{
    float
    *h_Data,
    *h_Kernel,
    *h_ResultCPU,
    *h_ResultGPU;

    float
    *d_Data,
    *d_PaddedData,
    *d_Kernel,
    *d_PaddedKernel;

    fComplex
    *d_DataSpectrum,
    *d_KernelSpectrum;

    cufftHandle
    fftPlanFwd,
    fftPlanInv;

    bool bRetVal;
    StopWatchInterface *hTimer = NULL;
    sdkCreateTimer(&hTimer);

    printf("Testing built-in R2C / C2R FFT-based convolution\n");
    const int kernelH = 3;
    const int kernelW = 3;
    const int kernelY = 1;
    const int kernelX = 1;
    const int   dataH = 10;
    const int   dataW = 10;
    const int    fftH = snapTransformSize(dataH + kernelH - 1);
    const int    fftW = snapTransformSize(dataW + kernelW - 1);

    printf("...allocating memory\n");
    h_Data      = (float *)malloc(dataH   * dataW * sizeof(float));
    h_Kernel    = (float *)malloc(kernelH * kernelW * sizeof(float));
    h_ResultCPU = (float *)malloc(dataH   * dataW * sizeof(float));
    h_ResultGPU = (float *)malloc(fftH    * fftW * sizeof(float));

    checkCudaErrors(cudaMalloc((void **)&d_Data,   dataH   * dataW   * sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_Kernel, kernelH * kernelW * sizeof(float)));

    checkCudaErrors(cudaMalloc((void **)&d_PaddedData,   fftH * fftW * sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_PaddedKernel, fftH * fftW * sizeof(float)));

    checkCudaErrors(cudaMalloc((void **)&d_DataSpectrum,   fftH * (fftW / 2 + 1) * sizeof(fComplex)));
    checkCudaErrors(cudaMalloc((void **)&d_KernelSpectrum, fftH * (fftW / 2 + 1) * sizeof(fComplex)));

    printf("...generating random input data\n");
    srand(2010);

    for (int i = 0; i < dataH * dataW; i++)
    {
        //h_Data[i] = getRand();
		h_Data[i] = i + 1;
    }

    for (int i = 0; i < kernelH * kernelW; i++)
    {
        //h_Kernel[i] = getRand();
		h_Kernel[i] = i + 1;
    }

	FILE* fp2 = fopen("input_kernel.txt", "w+");
	FILE* fp3 = fopen("input_data.txt", "w+");
	for (int i = 0; i < dataH * dataW; i++)
		fprintf(fp3, "%f\n", h_Data[i]);
	for (int i = 0; i < kernelH * kernelW; i++)
		fprintf(fp2, "%f\n", h_Kernel[i]);
	fclose(fp2);
	fclose(fp3);

    printf("...creating R2C & C2R FFT plans for %i x %i\n", fftH, fftW);
    checkCudaErrors(cufftPlan2d(&fftPlanFwd, fftH, fftW, CUFFT_R2C));
    checkCudaErrors(cufftPlan2d(&fftPlanInv, fftH, fftW, CUFFT_C2R));

    printf("...uploading to GPU and padding convolution kernel and input data\n");
    checkCudaErrors(cudaMemcpy(d_Kernel, h_Kernel, kernelH * kernelW * sizeof(float), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_Data,   h_Data,   dataH   * dataW *   sizeof(float), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemset(d_PaddedKernel, 0, fftH * fftW * sizeof(float)));
    checkCudaErrors(cudaMemset(d_PaddedData,   0, fftH * fftW * sizeof(float)));

    padKernel(
        d_PaddedKernel,
        d_Kernel,
        fftH,
        fftW,
        kernelH,
        kernelW,
        kernelY,
        kernelX
    );

    padDataClampToBorder(
        d_PaddedData,
        d_Data,
        fftH,
        fftW,
        dataH,
        dataW,
        kernelH,
        kernelW,
        kernelY,
        kernelX
    );

    //Not including kernel transformation into time measurement,
    //since convolution kernel is not changed very frequently
    printf("...transforming convolution kernel\n");
    checkCudaErrors(cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedKernel, (cufftComplex *)d_KernelSpectrum));

    printf("...running GPU FFT convolution: ");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    checkCudaErrors(cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedData, (cufftComplex *)d_DataSpectrum));
    modulateAndNormalize(d_DataSpectrum, d_KernelSpectrum, fftH, fftW, 1);
    checkCudaErrors(cufftExecC2R(fftPlanInv, (cufftComplex *)d_DataSpectrum, (cufftReal *)d_PaddedData));

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    double gpuTime = sdkGetTimerValue(&hTimer);
    printf("%f MPix/s (%f ms)\n", (double)dataH * (double)dataW * 1e-6 / (gpuTime * 0.001), gpuTime);

    printf("...reading back GPU convolution results\n");
    checkCudaErrors(cudaMemcpy(h_ResultGPU, d_PaddedData, fftH * fftW * sizeof(float), cudaMemcpyDeviceToHost));

    printf("...running reference CPU convolution\n");
    convolutionClampToBorderCPU(
        h_ResultCPU,
        h_Data,
        h_Kernel,
        dataH,
        dataW,
        kernelH,
        kernelW,
        kernelY,
        kernelX
    );

    printf("...comparing the results: ");
    double sum_delta2 = 0;
    double sum_ref2   = 0;
    double max_delta_ref = 0;

    for (int y = 0; y < dataH; y++)
        for (int x = 0; x < dataW; x++)
        {
            double  rCPU = (double)h_ResultCPU[y * dataW + x];
            double  rGPU = (double)h_ResultGPU[y * fftW  + x];
            double delta = (rCPU - rGPU) * (rCPU - rGPU);
            double   ref = rCPU * rCPU + rCPU * rCPU;

            if ((delta / ref) > max_delta_ref)
            {
                max_delta_ref = delta / ref;
            }

            sum_delta2 += delta;
            sum_ref2   += ref;
        }

    double L2norm = sqrt(sum_delta2 / sum_ref2);
    printf("rel L2 = %E (max delta = %E)\n", L2norm, sqrt(max_delta_ref));
    bRetVal = (L2norm < 1e-6) ? true : false;
    printf(bRetVal ? "L2norm Error OK\n" : "L2norm Error too high!\n");

    printf("...shutting down\n");
    sdkStartTimer(&hTimer);

    checkCudaErrors(cufftDestroy(fftPlanInv));
    checkCudaErrors(cufftDestroy(fftPlanFwd));

    checkCudaErrors(cudaFree(d_DataSpectrum));
    checkCudaErrors(cudaFree(d_KernelSpectrum));
    checkCudaErrors(cudaFree(d_PaddedData));
    checkCudaErrors(cudaFree(d_PaddedKernel));
    checkCudaErrors(cudaFree(d_Data));
    checkCudaErrors(cudaFree(d_Kernel));

	FILE* fp = fopen("result_gpu.txt", "w+");
	FILE* fp1 = fopen("result_cpu.txt", "w+");
	for (int i = 0; i < dataH * dataW; i++)
	{
		fprintf(fp, "%f\n", h_ResultGPU[i]);
		fprintf(fp1, "%f\n", h_ResultCPU[i]);
	}
	fclose(fp);
	fclose(fp1);

    free(h_ResultGPU);
    free(h_ResultCPU);
    free(h_Data);
    free(h_Kernel);

    return bRetVal;
}
Exemple #6
0
cufftResult WINAPI wine_cufftExecR2C(cufftHandle plan, cufftReal *idata, cufftComplex *odata){
	WINE_TRACE("\n");
	return cufftExecR2C( plan, idata, odata );
}
void accfft_execute_gpuf(accfft_plan_gpuf* plan, int direction,float * data_d, float * data_out_d, double * timer,std::bitset<3> xyz){

  if(data_d==NULL)
    data_d=plan->data;
  if(data_out_d==NULL)
    data_out_d=plan->data_out;

  int * coords=plan->coord;
  int procid=plan->procid;
  double fft_time=0;
  double timings[5]={0};

  cudaEvent_t memcpy_startEvent, memcpy_stopEvent;
  cudaEvent_t fft_startEvent, fft_stopEvent;
  checkCuda_accfft( cudaEventCreate(&memcpy_startEvent) );
  checkCuda_accfft( cudaEventCreate(&memcpy_stopEvent) );
  checkCuda_accfft( cudaEventCreate(&fft_startEvent) );
  checkCuda_accfft( cudaEventCreate(&fft_stopEvent) );
  int NY=plan->N[1];
  float dummy_time=0;


  int *osize_0 =plan->osize_0;// *ostart_0 =plan->ostart_0;
  int *osize_1 =plan->osize_1;// *ostart_1 =plan->ostart_1;
  //int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2;
  int *osize_1i=plan->osize_1i;//*ostart_1i=plan->ostart_1i;
  //int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i;

  if(direction==-1){
    /**************************************************************/
    /*******************  N0/P0 x N1/P1 x N2 **********************/
    /**************************************************************/
    // FFT in Z direction
    if(xyz[2]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecR2C(plan->fplan_0,(cufftReal*)data_d, (cufftComplex*)data_out_d));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
    else
      data_out_d=data_d;

    // Perform N0/P0 transpose


    if(!plan->oneD){
      plan->T_plan_1->execute_gpu(plan->T_plan_1,data_out_d,timings,2,osize_0[0],coords[0]);
    }
    /**************************************************************/
    /*******************  N0/P0 x N1 x N2/P1 **********************/
    /**************************************************************/
    if(xyz[1]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      for (int i=0;i<osize_1[0];++i){
        checkCuda_accfft (cufftExecC2C(plan->fplan_1,(cufftComplex*)&data_out_d[2*i*osize_1[1]*osize_1[2]], (cufftComplex*)&data_out_d[2*i*osize_1[1]*osize_1[2]],CUFFT_FORWARD));
      }
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
      MPI_Barrier(plan->c_comm);
    }

    if(plan->oneD){
      plan->T_plan_2->execute_gpu(plan->T_plan_2,data_out_d,timings,2);
    }
    else{
      plan->T_plan_2->execute_gpu(plan->T_plan_2,data_out_d,timings,2,1,coords[1]);
    }
    /**************************************************************/
    /*******************  N0 x N1/P0 x N2/P1 **********************/
    /**************************************************************/
    if(xyz[0]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecC2C(plan->fplan_2,(cufftComplex*)data_out_d, (cufftComplex*)data_out_d,CUFFT_FORWARD));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
  }
  else if (direction==1){
    if(xyz[0]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecC2C(plan->fplan_2,(cufftComplex*)data_d, (cufftComplex*)data_d,CUFFT_INVERSE));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
      MPI_Barrier(plan->c_comm);
    }

    if(plan->oneD){
      plan->T_plan_2i->execute_gpu(plan->T_plan_2i,(float*)data_d,timings,1);
    }
    else{
      plan->T_plan_2i->execute_gpu(plan->T_plan_2i,(float*)data_d,timings,1,1,coords[1]);
    }
    /**************************************************************/
    /*******************  N0/P0 x N1 x N2/P1 **********************/
    /**************************************************************/
    if(xyz[1]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      for (int i=0;i<osize_1i[0];++i){
        checkCuda_accfft (cufftExecC2C(plan->fplan_1,(cufftComplex*)&data_d[2*i*NY*osize_1i[2]], (cufftComplex*)&data_d[2*i*NY*osize_1i[2]],CUFFT_INVERSE));
      }
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
      MPI_Barrier(plan->c_comm);
    }



    if(!plan->oneD){
      plan->T_plan_1i->execute_gpu(plan->T_plan_1i,(float*)data_d,timings,1,osize_1i[0],coords[0]);
    }
    MPI_Barrier(plan->c_comm);
    /**************************************************************/
    /*******************  N0/P0 x N1/P1 x N2 **********************/
    /**************************************************************/

    // IFFT in Z direction
    if(xyz[2]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecC2R(plan->iplan_0,(cufftComplex*)data_d,(cufftReal*)data_out_d));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
    else
      data_out_d=data_d;

  }

  timings[4]+=fft_time;
  if(timer==NULL){
    //delete [] timings;
  }
  else{
    timer[0]+=timings[0];
    timer[1]+=timings[1];
    timer[2]+=timings[2];
    timer[3]+=timings[3];
    timer[4]+=timings[4];
  }
  MPI_Barrier(plan->c_comm);

  return;
}