Ejemplo n.º 1
0
/*
 * Class:     jcuda_jcufft_JCufft
 * Method:    cufftExecZ2ZNative
 * Signature: (Ljcuda/jcufft/cufftHandle;Ljcuda/Pointer;Ljcuda/Pointer;I)I
 */
JNIEXPORT jint JNICALL Java_jcuda_jcufft_JCufft_cufftExecZ2ZNative
  (JNIEnv *env, jclass cla, jobject handle, jobject cIdata, jobject cOdata, jint direction)
{
    if (handle == NULL)
    {
        ThrowByName(env, "java/lang/NullPointerException", "Parameter 'handle' is null for cufftExecZ2Z");
        return JCUFFT_INTERNAL_ERROR;
    }
    if (cIdata == NULL)
    {
        ThrowByName(env, "java/lang/NullPointerException", "Parameter 'cIdata' is null for cufftExecZ2Z");
        return JCUFFT_INTERNAL_ERROR;
    }
    if (cOdata == NULL)
    {
        ThrowByName(env, "java/lang/NullPointerException", "Parameter 'cOdata' is null for cufftExecZ2Z");
        return JCUFFT_INTERNAL_ERROR;
    }

    Logger::log(LOG_TRACE, "Executing cufftExecZ2Z\n");

    cufftHandle nativePlan = env->GetIntField(handle, cufftHandle_plan);
    cufftDoubleComplex* nativeCIData = (cufftDoubleComplex*)getPointer(env, cIdata);
    cufftDoubleComplex* nativeCOData = (cufftDoubleComplex*)getPointer(env, cOdata);

    cufftResult result = cufftExecZ2Z(nativePlan, nativeCIData, nativeCOData, direction);
    return result;
}
Ejemplo n.º 2
0
extern "C" void cuda_fft(double *d_data, int Lx, int Ny, void *stream) 
{ 
	cufftHandle plan; 
	cufftPlan1d(&plan, Lx, CUFFT_Z2Z, Ny); 
	cufftSetStream(plan, (cudaStream_t)stream); 
	cufftExecZ2Z(plan, (cufftDoubleComplex*)d_data, (cufftDoubleComplex*)d_data,CUFFT_FORWARD); 
	cufftDestroy(plan); 
} 
Ejemplo n.º 3
0
/*
 * Function to be called in thread managing host operations and invoking kernels
 */
void* host_thread(void* passing_ptr) {
    DataArray* data_arr_ptr = (DataArray*) passing_ptr;

    alloc_data_host(data_arr_ptr);
    printf("data allocated by host thread\n");

    //printf("data filling by host thread\n");
    for (uint64_t ii = 0; ii < data_arr_ptr->size; ii++) {
        (*(data_arr_ptr->data_r))[ii] = ii;
        (*(data_arr_ptr->data_k))[ii] = data_arr_ptr->size-ii;
    }
    printf("data filled by host thread\n");

    // synchronize after allocating memory - streams should be created, mem on device ready for copying
    pthread_barrier_wait (&barrier);
    printf("1st barier host thread - allocating mem on cpu\n");





    //  here we can make cufft plan, for example
    cufftHandle plan_forward;
    cufftPlan1d(&plan_forward, N, CUFFT_Z2Z, 1);



    // synchornize after ... - data should be copyied on device
    pthread_barrier_wait (&barrier);
    printf("2nd barier host thread - \n");


    // run some computations
    cufftExecZ2Z(plan_forward, *(data_arr_ptr->data_r_dev), *(data_arr_ptr->data_k_dev), CUFFT_FORWARD);
    printf("cufft done\n");

    // synchornize after computations -

    cudaDeviceSynchronize(); // should be used on
    pthread_barrier_wait (&barrier);
    printf("3rd barier host thread - \n");



    // synchornize after computations -
    pthread_barrier_wait (&barrier);
    printf("4th barier host thread - \n");

    printf("data visible in host thread:\n");
    /*for (uint64_t ii = 0; ii < (data_arr_ptr->size <= 32) ? data_arr_ptr->size : 32 ; ii++) {
      printf("%lu.\t",ii);
      printf("%lf + %lfj\t", creal( (*(data_arr_ptr->data_r))[ii] ), cimag( (*(data_arr_ptr->data_r))[ii] ));
      printf("%lf + %lfj\n", creal( (*(data_arr_ptr->data_k))[ii] ), cimag( (*(data_arr_ptr->data_k))[ii] ));
    }*/

    printf("closing host thread\n");
    pthread_exit(NULL);
}
Ejemplo n.º 4
0
void oskar_fft_exec(oskar_FFT* h, oskar_Mem* data, int* status)
{
    oskar_Mem *data_copy = 0, *data_ptr = data;
    if (oskar_mem_location(data) != h->location)
    {
        data_copy = oskar_mem_create_copy(data, h->location, status);
        data_ptr = data_copy;
    }
    if (h->location == OSKAR_CPU)
    {
        if (h->num_dim == 1)
        {
            *status = OSKAR_ERR_FUNCTION_NOT_AVAILABLE;
        }
        else if (h->num_dim == 2)
        {
            if (h->precision == OSKAR_DOUBLE)
                oskar_fftpack_cfft2f(h->dim_size, h->dim_size, h->dim_size,
                        oskar_mem_double(data_ptr, status),
                        oskar_mem_double(h->fftpack_wsave, status),
                        oskar_mem_double(h->fftpack_work, status));
            else
                oskar_fftpack_cfft2f_f(h->dim_size, h->dim_size, h->dim_size,
                        oskar_mem_float(data_ptr, status),
                        oskar_mem_float(h->fftpack_wsave, status),
                        oskar_mem_float(h->fftpack_work, status));
            /* This step not needed for W-kernel generation, so turn it off. */
            if (h->ensure_consistent_norm)
                oskar_mem_scale_real(data_ptr, (double)h->num_cells_total,
                        0, h->num_cells_total, status);
        }
    }
    else if (h->location == OSKAR_GPU)
    {
#ifdef OSKAR_HAVE_CUDA
        if (h->precision == OSKAR_DOUBLE)
            cufftExecZ2Z(h->cufft_plan,
                    (cufftDoubleComplex*) oskar_mem_void(data_ptr),
                    (cufftDoubleComplex*) oskar_mem_void(data_ptr),
                    CUFFT_FORWARD);
        else
            cufftExecC2C(h->cufft_plan,
                    (cufftComplex*) oskar_mem_void(data_ptr),
                    (cufftComplex*) oskar_mem_void(data_ptr),
                    CUFFT_FORWARD);
#endif
    }
    else
        *status = OSKAR_ERR_BAD_LOCATION;
    if (oskar_mem_location(data) != h->location)
        oskar_mem_copy(data, data_ptr, status);
    oskar_mem_free(data_copy, status);
}
Ejemplo n.º 5
0
      static cufftResult_t centerdifft(const std::complex<double>* in_mat, std::complex<double>* out_mat,
                                       cufftHandle* fftplan)
      {
        cufftResult_t cufftResult;

        cufftResult = cufftExecZ2Z(*fftplan,
                     (cufftDoubleComplex*)in_mat,
                     (cufftDoubleComplex*)out_mat,
                     CUFFT_INVERSE);
      
        AGILE_ASSERT(result == CUFFT_SUCCESS,
                        StandardException::ExceptionMessage(
                          "Error during FFT procedure"));

        return cufftResult;
      }
Ejemplo n.º 6
0
void fft3dGPU(T1* d_data, int nx, int ny, int nz, void* stream)
{   //printf("Running 3d forward xform \n");
	cufftHandle plan;
	cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_FFTW_ALL);

	if (cufftPlan3d(&plan, nz, ny, nx, CUFFT_Z2Z)!=CUFFT_SUCCESS) {
		printf("CUFFT error: Plan creation failed\n");
	}
	//printf("Built plan \n");

	cufftSetStream(plan, (cudaStream_t) stream);

	if (cufftExecZ2Z(plan, (cufftDoubleComplex*) d_data, (cufftDoubleComplex*) d_data, CUFFT_FORWARD)!=CUFFT_SUCCESS) {
		printf("CUFFT error: Plan execution failed\n");
	};
	cufftDestroy(plan);
}
Ejemplo n.º 7
0
 void single(char *dst, char *const *src)
 {
   cufftExecZ2Z(plan, reinterpret_cast<src_type *>(src[0]), reinterpret_cast<dst_type *>(dst), sign);
 }
Ejemplo n.º 8
0
void oskar_imager_finalise_plane(oskar_Imager* h, oskar_Mem* plane,
        double plane_norm, int* status)
{
    int size, num_cells;
    DeviceData* d;
    if (*status) return;

    /* Apply normalisation. */
    if (plane_norm > 0.0 || plane_norm < 0.0)
        oskar_mem_scale_real(plane, 1.0 / plane_norm, status);
    if (h->algorithm == OSKAR_ALGORITHM_DFT_2D ||
            h->algorithm == OSKAR_ALGORITHM_DFT_3D)
        return;

    /* Check plane is complex type, as plane must be gridded visibilities. */
    if (!oskar_mem_is_complex(plane))
    {
        *status = OSKAR_ERR_TYPE_MISMATCH;
        return;
    }

    /* Make image using FFT and apply grid correction. */
    size = h->grid_size;
    num_cells = size * size;
    d = &h->d[0];
    if (oskar_mem_precision(plane) == OSKAR_DOUBLE)
    {
        oskar_fftphase_cd(size, size, oskar_mem_double(plane, status));
        if (h->fft_on_gpu)
        {
#ifdef OSKAR_HAVE_CUDA
            oskar_device_set(h->cuda_device_ids[0], status);
            oskar_mem_copy(d->plane_gpu, plane, status);
            cufftExecZ2Z(h->cufft_plan, oskar_mem_void(d->plane_gpu),
                    oskar_mem_void(d->plane_gpu), CUFFT_FORWARD);
            oskar_mem_copy(plane, d->plane_gpu, status);
#else
            *status = OSKAR_ERR_CUDA_NOT_AVAILABLE;
#endif
        }
        else
        {
            oskar_fftpack_cfft2f(size, size, size,
                    oskar_mem_double(plane, status),
                    oskar_mem_double(h->fftpack_wsave, status),
                    oskar_mem_double(h->fftpack_work, status));
            oskar_mem_scale_real(plane, (double)num_cells, status);
        }
        oskar_fftphase_cd(size, size, oskar_mem_double(plane, status));
        oskar_grid_correction_d(size, oskar_mem_double(h->corr_func, status),
                oskar_mem_double(plane, status));
    }
    else
    {
        oskar_fftphase_cf(size, size, oskar_mem_float(plane, status));
        if (h->fft_on_gpu)
        {
#ifdef OSKAR_HAVE_CUDA
            oskar_device_set(h->cuda_device_ids[0], status);
            oskar_mem_copy(d->plane_gpu, plane, status);
            cufftExecC2C(h->cufft_plan, oskar_mem_void(d->plane_gpu),
                    oskar_mem_void(d->plane_gpu), CUFFT_FORWARD);
            oskar_mem_copy(plane, d->plane_gpu, status);
#else
            *status = OSKAR_ERR_CUDA_NOT_AVAILABLE;
#endif
        }
        else
        {
            oskar_fftpack_cfft2f_f(size, size, size,
                    oskar_mem_float(plane, status),
                    oskar_mem_float(h->fftpack_wsave, status),
                    oskar_mem_float(h->fftpack_work, status));
            oskar_mem_scale_real(plane, (double)num_cells, status);
        }
        oskar_fftphase_cf(size, size, oskar_mem_float(plane, status));
        oskar_grid_correction_f(size, oskar_mem_double(h->corr_func, status),
                oskar_mem_float(plane, status));
    }
}
Ejemplo n.º 9
0
/**
 * Execute C2C plan. This function is blocking and only returns after the transform is completed.
 * @note For inplace transforms, data_out should point to the same memory address as data, AND
 * the plan must have been created as inplace.
 * @param plan FFT plan created by \ref accfft_plan_dft_3d_r2c.
 * @param data Input data in frequency domain.
 * @param data_out Output data in frequency domain.
 * @param timer See \ref timer for more details.
 * @param XYZ a bit set field that determines which directions FFT should be executed
 */
void accfft_execute_c2c_gpu(accfft_plan_gpu* plan, int direction,Complex * data_d, Complex * data_out_d, double * timer,std::bitset<3> xyz){

  if(!plan->c2c_plan_baked){
    if(plan->procid==0) std::cout<<"Error. r2c plan has not been made correctly. Please first create the plan before calling execute functions."<<std::endl;
    return;
  }

  if(data_d==NULL)
    data_d=plan->data_c;
  if(data_out_d==NULL)
    data_out_d=plan->data_out_c;
  int * coords=plan->coord;
  int procid=plan->procid;
  double fft_time=0;
  double timings[5]={0};

  int NY=plan->N[1];
  cudaEvent_t memcpy_startEvent, memcpy_stopEvent;
  cudaEvent_t fft_startEvent, fft_stopEvent;
  checkCuda_accfft( cudaEventCreate(&memcpy_startEvent) );
  checkCuda_accfft( cudaEventCreate(&memcpy_stopEvent) );
  checkCuda_accfft( cudaEventCreate(&fft_startEvent) );
  checkCuda_accfft( cudaEventCreate(&fft_stopEvent) );
  cufftResult_t cufft_error;
  float dummy_time=0;

  int *osize_0 =plan->osize_0, *ostart_0 =plan->ostart_0;
  int *osize_1 =plan->osize_1, *ostart_1 =plan->ostart_1;
  int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2;
  int *osize_1i=plan->osize_1i,*ostart_1i=plan->ostart_1i;
  int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i;

  if(direction==-1){
    /**************************************************************/
    /*******************  N0/P0 x N1/P1 x N2 **********************/
    /**************************************************************/
    // FFT in Z direction
    if(xyz[2]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft(cufftExecZ2Z(plan->fplan_0,(cufftDoubleComplex*)data_d, (cufftDoubleComplex*)data_out_d,CUFFT_FORWARD));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
    else
      data_out_d=data_d;

    if(!plan->oneD){
      plan->T_plan_1->execute_gpu(plan->T_plan_1,(double*)data_out_d,timings,2,osize_0[0],coords[0]);
    }
    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);
    /**************************************************************/
    /*******************  N0/P0 x N1 x N2/P1 **********************/
    /**************************************************************/
    if(xyz[1]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      for (int i=0;i<osize_1[0];++i){
        checkCuda_accfft(cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_out_d[i*osize_1[1]*osize_1[2]], (cufftDoubleComplex*)&data_out_d[i*osize_1[1]*osize_1[2]],CUFFT_FORWARD));
      }
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }

    MPI_Barrier(plan->c_comm);


    if(plan->oneD){
      plan->T_plan_2->execute_gpu(plan->T_plan_2,(double*)data_out_d,timings,2);
    }
    else{
      plan->T_plan_2->execute_gpu(plan->T_plan_2,(double*)data_out_d,timings,2,1,coords[1]);
    }
    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);
    /**************************************************************/
    /*******************  N0 x N1/P0 x N2/P1 **********************/
    /**************************************************************/
    if(xyz[0]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft(cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_out_d, (cufftDoubleComplex*)data_out_d,CUFFT_FORWARD));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }

  }
  else if (direction==1){
    if(xyz[0]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_d, (cufftDoubleComplex*)data_d,CUFFT_INVERSE));
      checkCuda_accfft (cudaDeviceSynchronize());
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
      MPI_Barrier(plan->c_comm);
    }


    if(plan->oneD){
      plan->T_plan_2i->execute_gpu(plan->T_plan_2i,(double*)data_d,timings,1);
    }
    else{
      plan->T_plan_2i->execute_gpu(plan->T_plan_2i,(double*)data_d,timings,1,1,coords[1]);
    }
    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);
    /**************************************************************/
    /*******************  N0/P0 x N1 x N2/P1 **********************/
    /**************************************************************/
    if(xyz[1]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      for (int i=0;i<osize_1i[0];++i){
        checkCuda_accfft (cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_d[i*NY*osize_1i[2]], (cufftDoubleComplex*)&data_d[i*NY*osize_1i[2]],CUFFT_INVERSE));
      }
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
      MPI_Barrier(plan->c_comm);
    }

    if(!plan->oneD){
      plan->T_plan_1i->execute_gpu(plan->T_plan_1i,(double*)data_d,timings,1,osize_1i[0],coords[0]);
    }
    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);
    /**************************************************************/
    /*******************  N0/P0 x N1/P1 x N2 **********************/
    /**************************************************************/

    if(xyz[2]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecZ2Z(plan->fplan_0,(cufftDoubleComplex*)data_d,(cufftDoubleComplex*)data_out_d,CUFFT_INVERSE));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
    else
      data_out_d=data_d;

  }

  timings[4]+=fft_time;
  if(timer==NULL){
    //delete [] timings;
  }
  else{
    timer[0]+=timings[0];
    timer[1]+=timings[1];
    timer[2]+=timings[2];
    timer[3]+=timings[3];
    timer[4]+=timings[4];
  }
  MPI_Barrier(plan->c_comm);

  return;
}// end accfft_execute_c2c_gpu
Ejemplo n.º 10
0
void accfft_execute_gpu(accfft_plan_gpu* plan, int direction,double * data_d, double * data_out_d, double * timer,std::bitset<3> xyz){

  if(data_d==NULL)
    data_d=plan->data;
  if(data_out_d==NULL)
    data_out_d=plan->data_out;

  int * coords=plan->coord;
  int procid=plan->procid;
  double fft_time=0;
  double timings[5]={0};

  cudaEvent_t memcpy_startEvent, memcpy_stopEvent;
  cudaEvent_t fft_startEvent, fft_stopEvent;
  checkCuda_accfft( cudaEventCreate(&memcpy_startEvent) );
  checkCuda_accfft( cudaEventCreate(&memcpy_stopEvent) );
  checkCuda_accfft( cudaEventCreate(&fft_startEvent) );
  checkCuda_accfft( cudaEventCreate(&fft_stopEvent) );
  int NY=plan->N[1];
  float dummy_time=0;


  int *osize_0 =plan->osize_0;// *ostart_0 =plan->ostart_0;
  int *osize_1 =plan->osize_1;// *ostart_1 =plan->ostart_1;
  //int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2;
  int *osize_1i=plan->osize_1i;//*ostart_1i=plan->ostart_1i;
  //int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i;

  if(direction==-1){
    /**************************************************************/
    /*******************  N0/P0 x N1/P1 x N2 **********************/
    /**************************************************************/
    // FFT in Z direction
    if(xyz[2]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecD2Z(plan->fplan_0,(cufftDoubleReal*)data_d, (cufftDoubleComplex*)data_out_d));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
    else
      data_out_d=data_d;
    // Perform N0/P0 transpose


    if(!plan->oneD){
      plan->T_plan_1->execute_gpu(plan->T_plan_1,data_out_d,timings,2,osize_0[0],coords[0]);
    }
    /**************************************************************/
    /*******************  N0/P0 x N1 x N2/P1 **********************/
    /**************************************************************/
    if(xyz[1]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      for (int i=0;i<osize_1[0];++i){
        checkCuda_accfft (cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_out_d[2*i*osize_1[1]*osize_1[2]], (cufftDoubleComplex*)&data_out_d[2*i*osize_1[1]*osize_1[2]],CUFFT_FORWARD));
      }
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }

    if(plan->oneD){
      plan->T_plan_2->execute_gpu(plan->T_plan_2,data_out_d,timings,2);
    }
    else{
      plan->T_plan_2->execute_gpu(plan->T_plan_2,data_out_d,timings,2,1,coords[1]);
    }
    /**************************************************************/
    /*******************  N0 x N1/P0 x N2/P1 **********************/
    /**************************************************************/
    MPI_Barrier(plan->c_comm);
    if(xyz[0]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_out_d, (cufftDoubleComplex*)data_out_d,CUFFT_FORWARD));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
  }
  else if (direction==1){
    if(xyz[0]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_d, (cufftDoubleComplex*)data_d,CUFFT_INVERSE));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }

    if(plan->oneD){
      plan->T_plan_2i->execute_gpu(plan->T_plan_2i,data_d,timings,1);
    }
    else{
      plan->T_plan_2i->execute_gpu(plan->T_plan_2i,data_d,timings,1,1,coords[1]);
    }
    MPI_Barrier(plan->c_comm);
    /**************************************************************/
    /*******************  N0/P0 x N1 x N2/P1 **********************/
    /**************************************************************/
    if(xyz[1]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      for (int i=0;i<osize_1i[0];++i){
        checkCuda_accfft (cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_d[2*i*NY*osize_1i[2]], (cufftDoubleComplex*)&data_d[2*i*NY*osize_1i[2]],CUFFT_INVERSE));
      }
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }



    if(!plan->oneD){
      plan->T_plan_1i->execute_gpu(plan->T_plan_1i,data_d,timings,1,osize_1i[0],coords[0]);
    }
    /**************************************************************/
    /*******************  N0/P0 x N1/P1 x N2 **********************/
    /**************************************************************/

    // IFFT in Z direction
    if(xyz[2]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecZ2D(plan->iplan_0,(cufftDoubleComplex*)data_d,(cufftDoubleReal*)data_out_d));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
    else
      data_out_d=data_d;

  }

  timings[4]+=fft_time;
  if(timer==NULL){
    //delete [] timings;
  }
  else{
    timer[0]+=timings[0];
    timer[1]+=timings[1];
    timer[2]+=timings[2];
    timer[3]+=timings[3];
    timer[4]+=timings[4];
  }
  checkCuda_accfft (cudaDeviceSynchronize());
  MPI_Barrier(plan->c_comm);

  return;
}// end accfft_execute_gpu
Ejemplo n.º 11
0
cufftResult WINAPI wine_cufftExecZ2Z(cufftHandle plan, cufftDoubleComplex *idata, cufftDoubleComplex *odata, int direction){
	WINE_TRACE("\n");
	return cufftExecZ2Z( plan, idata, odata, direction );
}
Ejemplo n.º 12
0
/* 3D FFT for field generation */
void make_field_3dfft(int _fielddim)
{
	std::chrono::time_point<std::chrono::high_resolution_clock> time1, time2, time3;
	time1 = std::chrono::system_clock::now();

	double PLindex;	
	//	int nr_3Dfields, fielddim, index, mod_fielddim;	
	int neg_k,neg_i,neg_j;

	cufftDoubleComplex *host_array;
	cufftDoubleComplex *device_array;
	cufftHandle back_plan;	

	PLindex = atof(test_fftw[1]);	
	int fielddim = _fielddim;//atoi(test_fftw[2]);
	int nr_3Dfields = atoi(test_fftw[3]);
	int mod_fielddim = fielddim+1;

	size_t nr_points = (fielddim+1)*(fielddim+1)*(fielddim+1);
	size_t num_bytes = nr_points * sizeof(cufftDoubleComplex);

	// mem allocation
	host_array = (cufftDoubleComplex*) malloc(num_bytes);
	//array_in = (fftw_complex*) fftw_malloc(nr_points * sizeof(fftw_complex));
	checkCudaErrors(cudaMalloc((void **)&device_array, num_bytes));

	//back_plan = fftw_plan_dft_3d(mod_fielddim, mod_fielddim, mod_fielddim,array_in, array_in, FFTW_BACKWARD , FFTW_MEASURE);
	cufftPlan3d(&back_plan, mod_fielddim, mod_fielddim, mod_fielddim, CUFFT_Z2Z);
	//array_in, array_in, FFTW_BACKWARD , FFTW_MEASURE);

	cout << endl <<"fiel_dim " << mod_fielddim << endl;
	
	for(int m=0; m<nr_3Dfields; m++){	
		cout << endl <<"field #" << m << endl;		

		for(int i=-fielddim/2;i<=fielddim/2;i++){	//fill the data that goes into the FFT 
			for(int j=-fielddim/2;j<=fielddim/2;j++){
				for(int k=0;k<=fielddim/2;k++){
					//positive k parts 
					double abs_k = sqrt((double)(i*i+j*j+k*k));
					double A = 1000.0*pow(abs_k,(PLindex*0.5));
					double phase=(2.0*rand_01()-1.0)*PI;
					int index = (i+fielddim/2)*mod_fielddim*mod_fielddim+(j+fielddim/2)*mod_fielddim+k+fielddim/2;

					//array_in[index]=A*cos(phase)+I*A*sin(phase);
					host_array[index].x = A*cos(phase);
					host_array[index].y = A*sin(phase);

					//negative k parts
					//phase=-1.0*phase;
					neg_i =- 1*i;
					neg_j =- 1*j;
					neg_k =- 1*k;
					index = (neg_i+fielddim/2)*mod_fielddim*mod_fielddim+(neg_j+fielddim/2)*mod_fielddim+neg_k+fielddim/2;

					//array_in[index]=A*cos(phase)+I*A*sin(-1.0*phase);
					host_array[index].x = A*cos(phase);
					host_array[index].y = A*sin(-1.0*phase);
				}
			}
		}

		host_array[(fielddim/2)*mod_fielddim*mod_fielddim+(fielddim/2)*mod_fielddim+fielddim/2].x = 1.0;
		host_array[(fielddim/2)*mod_fielddim*mod_fielddim+(fielddim/2)*mod_fielddim+fielddim/2].y = 0.0;

		//Rearange array for use in FFTW library 
		for(int j=0;j<=nr_points/2;j++){
			cufftDoubleComplex temp = host_array[j];
			host_array[j] = host_array[j+nr_points/2];
			host_array[j+nr_points/2] = temp;
		}

		time2 = std::chrono::system_clock::now();

		// move data to the GPU
		cudaMemcpy(device_array, host_array, num_bytes, cudaMemcpyDeviceToHost);

		///	fftw_execute(back_plan);
		cufftExecZ2Z(back_plan, device_array, device_array, FFTW_BACKWARD);

		// move back
		cudaMemcpy(host_array, device_array, num_bytes, cudaMemcpyHostToDevice);

		time3 = std::chrono::system_clock::now();

		auto s1 = std::chrono::duration_cast<std::chrono::milliseconds>(time2-time1).count();
		auto s2 = std::chrono::duration_cast<std::chrono::milliseconds>(time3-time2).count();
		auto time_total = s1 + s2;	

		cout << endl << 
			"--- 3D FFT make_field ---" << endl
			<< "1.        " << s1 << " ms" << endl
			<< "2.        " << s2 << " ms" << endl				
			<< "---" << endl
			<< "total time             " << time_total << " ms" << endl
			<< "---" << endl;


		//for(i=0;i<nr_points;i++) printf("%g + i*%g\n",creal(array_in[i]),cimag(array_in[i]));	

		/*		for(int i=0;i<mod_fielddim;i++){
		for(int j=0;j<mod_fielddim;j++){
		for(int k=0;k<mod_fielddim;k++){
		printf("%.15lf\n", host_array[i*mod_fielddim*mod_fielddim+j*mod_fielddim+k].x);
		}
		}
		}
		*/

	}

	// print some elemntes for debug
	double *print_array = (double*) host_array;
	print_double(print_array, 100); // first 100 elements
	print_double(print_array+(2*nr_points-100), 100); // last 100 elements


	/*
	fftw_free(array_in);	
	fftw_destroy_plan(back_plan);	
	*/
	cufftDestroy(back_plan);
	cudaFree(device_array);
	free(host_array);
}