예제 #1
0
void
mexFunction( int nlhs, mxArray *plhs[],
             int nrhs, const  mxArray *prhs[] )
{


  if (nrhs!=5)
    mexErrMsgTxt("Wrong number of arguments");


  cufftHandle plan =  (cufftHandle) mxGetScalar(prhs[0]);
  int nx  =  (int) mxGetScalar(prhs[1]);
  int ny  =  (int) mxGetScalar(prhs[2]);
  int nz  =  (int) mxGetScalar(prhs[3]);
  cufftType_t type = (cufftType_t) ((int)mxGetScalar(prhs[4]));

  cufftResult status = cufftPlan3d(&plan, nx, ny, nz, type);

  plhs[0] = mxCreateDoubleScalar(status);
  if (nlhs>1)
    plhs[1] = mxCreateDoubleScalar(plan);



}
예제 #2
0
void fft3dGPU(T1* d_data, int nx, int ny, int nz, void* stream)
{
	cufftHandle plan;
	cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_FFTW_ALL);

	if (cufftPlan3d(&plan, nz, ny, nx, CUFFT_C2C)!=CUFFT_SUCCESS) {
		fprintf(stderr, "CUFFT error: Plan creation failed");
	}

	cufftSetStream(plan, (cudaStream_t) stream);
	cufftExecC2C(plan, (cufftComplex*) d_data, (cufftComplex*) d_data, CUFFT_FORWARD);
	cufftDestroy(plan);
}
예제 #3
0
sararfftnd_plan sararfft3d_create_plan(
  int nx, int ny, int nz,
  sarafft_direction dir
) {
#ifdef USE_GPUS
  sararfftnd_plan plan;
  cufftResult result = cufftPlan3d( &plan, nx, ny, nz, dir );
  if( CUFFT_SUCCESS != result )
    exit(64); // TODO better error handling (but to do that, the caller must be rewritten)
  return plan;
#else // #ifndef USE_GPUS
  return rfftw3d_create_plan( nx, ny, nz, dir, FFTW_MEASURE | FFTW_IN_PLACE );
#endif
}
예제 #4
0
파일: JCufft.cpp 프로젝트: caomw/jcufft
/*
 * Class:     jcuda_jcufft_JCufft
 * Method:    cufftPlan3dNative
 * Signature: (Ljcuda/jcufft/JCufftHandle;IIII)I
 */
JNIEXPORT jint JNICALL Java_jcuda_jcufft_JCufft_cufftPlan3dNative
  (JNIEnv *env, jclass cla, jobject handle, jint nx, jint ny, jint nz, jint type)
{
    if (handle == NULL)
    {
        ThrowByName(env, "java/lang/NullPointerException", "Parameter 'handle' is null for cufftPlan3d");
        return JCUFFT_INTERNAL_ERROR;
    }

    Logger::log(LOG_TRACE, "Creating 3D plan for (%d, %d, %d) elements of type %d\n", nx, ny, nz, type);

    cufftHandle plan = env->GetIntField(handle, cufftHandle_plan);
    cufftResult result = cufftPlan3d(&plan, nx, ny, nz, getCufftType(type));
    env->SetIntField(handle, cufftHandle_plan, plan);
    return result;
}
예제 #5
0
sararfftnd_plan sararfft3d_create_plan(
  int nx, int ny, int nz, sarafft_direction dir
) {
  sararfftnd_plan plan;
  printf( "cufftPlan3d() about to start!\n" );
  fflush ( stdout );
  cufftResult result = cufftPlan3d( &plan, nx, ny, nz, dir );
  if ( CUFFT_SUCCESS != result ) {
    printf( "cufftPlan3d() failed with code %d for dir=%d\n", result, dir );
    fflush ( stdout );
    exit( 84 ); // TODO better error handling (but to do that, the caller must be rewritten)
  }
  printf( "cufftPlan3d() succeeded!\n" );
  fflush ( stdout );
  setPlanSize ( plan, sizeof( sarafft_real ) * nx * ny * ( nz + 2 ) );
  return plan;
}
예제 #6
0
void fft3dGPU(T1* d_data, int nx, int ny, int nz, void* stream)
{   //printf("Running 3d forward xform \n");
	cufftHandle plan;
	cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_FFTW_ALL);

	if (cufftPlan3d(&plan, nz, ny, nx, CUFFT_Z2Z)!=CUFFT_SUCCESS) {
		printf("CUFFT error: Plan creation failed\n");
	}
	//printf("Built plan \n");

	cufftSetStream(plan, (cudaStream_t) stream);

	if (cufftExecZ2Z(plan, (cufftDoubleComplex*) d_data, (cufftDoubleComplex*) d_data, CUFFT_FORWARD)!=CUFFT_SUCCESS) {
		printf("CUFFT error: Plan execution failed\n");
	};
	cufftDestroy(plan);
}
예제 #7
0
void gpuNUFFT::GpuNUFFTOperator::initDeviceMemory(int n_coils)
{
  if (gpuMemAllocated)
    return;

  gi_host = initAndCopyGpuNUFFTInfo();//

  int			data_count          = (int)this->kSpaceTraj.count();
  IndType imdata_count        = this->imgDims.count();
  int			sector_count        = (int)this->gridSectorDims.count();
    
  if (DEBUG)
    printf("allocate and copy data indices of size %d...\n",dataIndices.count());
  allocateAndCopyToDeviceMem<IndType>(&data_indices_d,dataIndices.data,dataIndices.count());
  
  if (DEBUG)
    printf("allocate and copy data of size %d...\n",data_count);
  allocateDeviceMem<DType2>(&data_sorted_d,data_count);
  
  if (DEBUG)
    printf("allocate and copy gdata of size %d...\n",gi_host->grid_width_dim);
  allocateDeviceMem<CufftType>(&gdata_d,gi_host->grid_width_dim);

  if (DEBUG)
    printf("allocate and copy coords of size %d...\n",getImageDimensionCount()*data_count);
  allocateAndCopyToDeviceMem<DType>(&crds_d,this->kSpaceTraj.data,getImageDimensionCount()*data_count);

  if (DEBUG)
    printf("allocate and copy kernel in const memory of size %d...\n",this->kernel.count());

  initLookupTable();

  //allocateAndCopyToDeviceMem<DType>(&kernel_d,kernel,kernel_count);
  if (DEBUG)
    printf("allocate and copy sectors of size %d...\n",sector_count+1);
  allocateAndCopyToDeviceMem<IndType>(&sectors_d,this->sectorDataCount.data,sector_count+1);

  if (DEBUG)
    printf("allocate and copy sector_centers of size %d...\n",getImageDimensionCount()*sector_count);
  allocateAndCopyToDeviceMem<IndType>(&sector_centers_d,(IndType*)this->getSectorCentersData(),getImageDimensionCount()*sector_count);

  if (this->applyDensComp())	
  {
    if (DEBUG)
      printf("allocate and copy density compensation of size %d...\n",data_count);
    allocateAndCopyToDeviceMem<DType>(&density_comp_d,this->dens.data,data_count);
  }

  if (this->applySensData())	
  {
    if (DEBUG)
      printf("allocate sens data of size %d...\n",imdata_count);
    allocateDeviceMem<DType2>(&sens_d,imdata_count);
  }
  
  if (n_coils > 1)
  {
    if (DEBUG)
      printf("allocate precompute deapofunction of size %d...\n",imdata_count);
    allocateDeviceMem<DType>(&deapo_d,imdata_count);
    precomputeDeapodization(deapo_d,gi_host);
  }
  if (DEBUG)
    printf("sector pad width: %d\n",gi_host->sector_pad_width);

  //Inverse fft plan and execution
  if (DEBUG)
    printf("creating cufft plan with %d,%d,%d dimensions\n",DEFAULT_VALUE(gi_host->gridDims.z),gi_host->gridDims.y,gi_host->gridDims.x);
  cufftResult res = cufftPlan3d(&fft_plan, (int)DEFAULT_VALUE(gi_host->gridDims.z),(int)gi_host->gridDims.y,(int)gi_host->gridDims.x, CufftTransformType) ;
  if (res != CUFFT_SUCCESS) 
    fprintf(stderr,"error on CUFFT Plan creation!!! %d\n",res);
  gpuMemAllocated = true;
}
예제 #8
0
struct fft_plan_3d *fft_3d_create_plan_cuda(
       MPI_Comm comm, int nfast, int nmid, int nslow,
       int in_ilo, int in_ihi, int in_jlo, int in_jhi,
       int in_klo, int in_khi,
       int out_ilo, int out_ihi, int out_jlo, int out_jhi,
       int out_klo, int out_khi,
       int scaled, int permute, int *nbuf,bool ainit)
{
#ifdef FFT_CUFFT
  struct fft_plan_3d *plan;
  int me,nprocs;
  int i,num,flag,remapflag,fftflag;
  int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi;
  int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi;
  int third_ilo,third_ihi,third_jlo,third_jhi,third_klo,third_khi;
  int out_size,first_size,second_size,third_size,copy_size,scratch_size;
  int np1,np2,ip1,ip2;
  int list[50];

  // system specific variables

  // query MPI info

  MPI_Comm_rank(comm,&me);
  MPI_Comm_size(comm,&nprocs);

#ifndef FFT_CUFFT
    error->all(FLERR,"ERROR: Trying to use cuda fft without FFT_CUFFT set. Recompile with make option 'cufft=1'.");
#endif
  // compute division of procs in 2 dimensions not on-processor
  bifactor_cuda(nprocs,&np1,&np2);
  ip1 = me % np1;
  ip2 = me/np1;

  // in case of CUDA FFT every proc does the full FFT in order to avoid data transfers (the problem is other wise heavily bandwidth limited)

  int ip1out = ip1;
  int ip2out = ip2;
  int np1out = np1;
  int np2out = np2;

  ip1 = 0;
  ip2 = 0;
  np1 = 1;
  np2 = 1;

  // allocate memory for plan data struct

  plan = (struct fft_plan_3d *) malloc(sizeof(struct fft_plan_3d));
  if (plan == NULL) return NULL;
  plan->init=ainit;

  // remap from initial distribution to layout needed for 1st set of 1d FFTs
  // not needed if all procs own entire fast axis initially
  // first indices = distribution after 1st set of FFTs

  if (in_ilo == 0 && in_ihi == nfast-1)
    flag = 0;
  else
    flag = 1;

  if(nprocs>1)flag=1;

  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

  if (remapflag == 0) {
    first_ilo = in_ilo;
    first_ihi = in_ihi;
    first_jlo = in_jlo;
    first_jhi = in_jhi;
    first_klo = in_klo;
    first_khi = in_khi;
    plan->pre_plan = NULL;
  }
  else {
    first_ilo = 0;
    first_ihi = nfast - 1;
    first_jlo = ip1*nmid/np1;
    first_jhi = (ip1+1)*nmid/np1 - 1;
    first_klo = ip2*nslow/np2;
    first_khi = (ip2+1)*nslow/np2 - 1;
    int members=2;
    if(plan->init) members=1;
    plan->pre_plan =
      remap_3d_create_plan(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                           first_ilo,first_ihi,first_jlo,first_jhi,
                           first_klo,first_khi,
                           members,0,0,2,0);
    if (plan->pre_plan == NULL) return NULL;
  }

  // 1d FFTs along fast axis

  plan->length1 = nfast;
  plan->total1 = nfast * nmid * nslow;

  // remap from 1st to 2nd FFT
  // choose which axis is split over np1 vs np2 to minimize communication
  // second indices = distribution after 2nd set of FFTs

  second_ilo = ip1*nfast/np1;
  second_ihi = (ip1+1)*nfast/np1 - 1;
  second_jlo = 0;
  second_jhi = nmid - 1;
  second_klo = ip2*nslow/np2;
  second_khi = (ip2+1)*nslow/np2 - 1;
  plan->mid1_plan =
      remap_3d_create_plan(comm,
                           first_ilo,first_ihi,first_jlo,first_jhi,
                           first_klo,first_khi,
                           second_ilo,second_ihi,second_jlo,second_jhi,
                           second_klo,second_khi,
                           2,1,0,2,0);
  if (plan->mid1_plan == NULL) return NULL;

  // 1d FFTs along mid axis

  plan->length2 = nmid;
  plan->total2 = nfast * nmid * nslow;

  // remap from 2nd to 3rd FFT
  // if final distribution is permute=2 with all procs owning entire slow axis
  //   then this remapping goes directly to final distribution
  //  third indices = distribution after 3rd set of FFTs

  flag=1;

  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

  if (remapflag == 0) {
    third_ilo = out_ilo;
    third_ihi = out_ihi;
    third_jlo = out_jlo;
    third_jhi = out_jhi;
    third_klo = out_klo;
    third_khi = out_khi;
  }
  else {
    third_ilo = ip1*nfast/np1;
    third_ihi = (ip1+1)*nfast/np1 - 1;
    third_jlo = ip2*nmid/np2;
    third_jhi = (ip2+1)*nmid/np2 - 1;
    third_klo = 0;
    third_khi = nslow - 1;
  }

  plan->mid2_plan =
    remap_3d_create_plan(comm,
                         second_jlo,second_jhi,second_klo,second_khi,
                         second_ilo,second_ihi,
                         third_jlo,third_jhi,third_klo,third_khi,
                         third_ilo,third_ihi,
                         2,1,0,2,0);
  if (plan->mid2_plan == NULL) return NULL;

  // 1d FFTs along slow axis

  plan->length3 = nslow;
  plan->total3 = nfast * nmid * nslow;

  // remap from 3rd FFT to final distribution
  //  not needed if permute = 2 and third indices = out indices on all procs

  flag=1;

  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

  if (remapflag == 0)
    plan->post_plan = NULL;
  else {
    plan->post_plan =
      remap_3d_create_plan(comm,
                           third_klo,third_khi,third_ilo,third_ihi,
                           third_jlo,third_jhi,
                           out_klo,out_khi,out_ilo,out_ihi,
                           out_jlo,out_jhi,
                           2,(permute+1)%3,0,2,0);
    if (plan->post_plan == NULL) return NULL;
  }

  // configure plan memory pointers and allocate work space
  // out_size = amount of memory given to FFT by user
  // first/second/third_size = amount of memory needed after pre,mid1,mid2 remaps
  // copy_size = amount needed internally for extra copy of data
  // scratch_size = amount needed internally for remap scratch space
  // for each remap:
  //   out space used for result if big enough, else require copy buffer
  //   accumulate largest required remap scratch space

  out_size = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1);
  first_size = (first_ihi-first_ilo+1) * (first_jhi-first_jlo+1) *
    (first_khi-first_klo+1);
  second_size = (second_ihi-second_ilo+1) * (second_jhi-second_jlo+1) *
    (second_khi-second_klo+1);
  third_size = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) *
    (third_khi-third_klo+1);

  plan->ihi_out=out_ihi;
  plan->ilo_out=out_ilo;
  plan->jhi_out=out_jhi;
  plan->jlo_out=out_jlo;
  plan->khi_out=out_khi;
  plan->klo_out=out_klo;

  copy_size = 0;
  scratch_size = 0;

  if (plan->pre_plan) {
    if (first_size <= out_size)
      plan->pre_target = 0;
    else {
      plan->pre_target = 1;
      copy_size = MAX(copy_size,first_size);
    }
    scratch_size = MAX(scratch_size,first_size);
  }

  if (plan->mid1_plan) {
    if (second_size <= out_size)
      plan->mid1_target = 0;
    else {
      plan->mid1_target = 1;
      copy_size = MAX(copy_size,second_size);
    }
    scratch_size = MAX(scratch_size,second_size);
  }

  if (plan->mid2_plan) {
    if (third_size <= out_size)
      plan->mid2_target = 0;
    else {
      plan->mid2_target = 1;
      copy_size = MAX(copy_size,third_size);
    }
    scratch_size = MAX(scratch_size,third_size);
  }

  if (plan->post_plan)
    scratch_size = MAX(scratch_size,out_size);

  *nbuf = copy_size + scratch_size;

  if (copy_size) {
    plan->copy = (FFT_DATA *) malloc(copy_size*sizeof(FFT_DATA));
    if (plan->copy == NULL) return NULL;
  }
  else plan->copy = NULL;

  if (scratch_size) {
    plan->scratch = (FFT_DATA *) malloc(scratch_size*sizeof(FFT_DATA));
    if (plan->scratch == NULL) return NULL;
  }
  else plan->scratch = NULL;

  // system specific pre-computation of 1d FFT coeffs
  // and scaling normalization

  cufftResult retvalc;
  int nfft = (in_ihi-in_ilo+1) * (in_jhi-in_jlo+1) *
    (in_khi-in_klo+1);
  int nfft_brick = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
    (out_khi-out_klo+1);

  int nfft_both = MAX(nfft,nfft_brick);
  nfft_both=nfast*nmid*nslow;

  plan->cudatasize=nfft_both*sizeof(FFT_DATA);

  //retvalc=cufftPlan1d(&(plan->plan_fast), nfast, CUFFT_PLAN,plan->total1/nfast);
  //if(retvalc!=CUFFT_SUCCESS) printf("ErrorCUFFT1: %i\n",retvalc);
  plan->nfast=nfast;

  //retvalc=cufftPlan1d(&(plan->plan_mid), nmid, CUFFT_PLAN,plan->total2/nmid);
  //if(retvalc!=CUFFT_SUCCESS) printf("ErrorCUFFT2: %i\n",retvalc);
  plan->nmid=nmid;

  //retvalc=cufftPlan1d(&(plan->plan_slow), nslow, CUFFT_PLAN,plan->total3/nslow);
  //if(retvalc!=CUFFT_SUCCESS) printf("ErrorCUFFT3: %i\n",retvalc);
  plan->nslow=nslow;

  retvalc=cufftPlan3d(&(plan->plan_3d), nslow,nmid,nfast, CUFFT_PLAN);
  if(retvalc!=CUFFT_SUCCESS) printf("ErrorCUFFT3: %i\n",retvalc);

  plan->nprocs=nprocs;
  plan->me=me;
  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/(nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

  plan->coretime=0;
  plan->iterate=0;
  plan->ffttime=0;
  return plan;
  #endif
}
예제 #9
0
cufftResult WINAPI wine_cufftPlan3d(cufftHandle *plan, int nx, int ny, int nz, cufftType type){
	WINE_TRACE("\n");
	return cufftPlan3d( plan, nx, ny, nz, type );
}
예제 #10
0
/* 3D FFT for field generation */
void make_field_3dfft(int _fielddim)
{
	std::chrono::time_point<std::chrono::high_resolution_clock> time1, time2, time3;
	time1 = std::chrono::system_clock::now();

	double PLindex;	
	//	int nr_3Dfields, fielddim, index, mod_fielddim;	
	int neg_k,neg_i,neg_j;

	cufftDoubleComplex *host_array;
	cufftDoubleComplex *device_array;
	cufftHandle back_plan;	

	PLindex = atof(test_fftw[1]);	
	int fielddim = _fielddim;//atoi(test_fftw[2]);
	int nr_3Dfields = atoi(test_fftw[3]);
	int mod_fielddim = fielddim+1;

	size_t nr_points = (fielddim+1)*(fielddim+1)*(fielddim+1);
	size_t num_bytes = nr_points * sizeof(cufftDoubleComplex);

	// mem allocation
	host_array = (cufftDoubleComplex*) malloc(num_bytes);
	//array_in = (fftw_complex*) fftw_malloc(nr_points * sizeof(fftw_complex));
	checkCudaErrors(cudaMalloc((void **)&device_array, num_bytes));

	//back_plan = fftw_plan_dft_3d(mod_fielddim, mod_fielddim, mod_fielddim,array_in, array_in, FFTW_BACKWARD , FFTW_MEASURE);
	cufftPlan3d(&back_plan, mod_fielddim, mod_fielddim, mod_fielddim, CUFFT_Z2Z);
	//array_in, array_in, FFTW_BACKWARD , FFTW_MEASURE);

	cout << endl <<"fiel_dim " << mod_fielddim << endl;
	
	for(int m=0; m<nr_3Dfields; m++){	
		cout << endl <<"field #" << m << endl;		

		for(int i=-fielddim/2;i<=fielddim/2;i++){	//fill the data that goes into the FFT 
			for(int j=-fielddim/2;j<=fielddim/2;j++){
				for(int k=0;k<=fielddim/2;k++){
					//positive k parts 
					double abs_k = sqrt((double)(i*i+j*j+k*k));
					double A = 1000.0*pow(abs_k,(PLindex*0.5));
					double phase=(2.0*rand_01()-1.0)*PI;
					int index = (i+fielddim/2)*mod_fielddim*mod_fielddim+(j+fielddim/2)*mod_fielddim+k+fielddim/2;

					//array_in[index]=A*cos(phase)+I*A*sin(phase);
					host_array[index].x = A*cos(phase);
					host_array[index].y = A*sin(phase);

					//negative k parts
					//phase=-1.0*phase;
					neg_i =- 1*i;
					neg_j =- 1*j;
					neg_k =- 1*k;
					index = (neg_i+fielddim/2)*mod_fielddim*mod_fielddim+(neg_j+fielddim/2)*mod_fielddim+neg_k+fielddim/2;

					//array_in[index]=A*cos(phase)+I*A*sin(-1.0*phase);
					host_array[index].x = A*cos(phase);
					host_array[index].y = A*sin(-1.0*phase);
				}
			}
		}

		host_array[(fielddim/2)*mod_fielddim*mod_fielddim+(fielddim/2)*mod_fielddim+fielddim/2].x = 1.0;
		host_array[(fielddim/2)*mod_fielddim*mod_fielddim+(fielddim/2)*mod_fielddim+fielddim/2].y = 0.0;

		//Rearange array for use in FFTW library 
		for(int j=0;j<=nr_points/2;j++){
			cufftDoubleComplex temp = host_array[j];
			host_array[j] = host_array[j+nr_points/2];
			host_array[j+nr_points/2] = temp;
		}

		time2 = std::chrono::system_clock::now();

		// move data to the GPU
		cudaMemcpy(device_array, host_array, num_bytes, cudaMemcpyDeviceToHost);

		///	fftw_execute(back_plan);
		cufftExecZ2Z(back_plan, device_array, device_array, FFTW_BACKWARD);

		// move back
		cudaMemcpy(host_array, device_array, num_bytes, cudaMemcpyHostToDevice);

		time3 = std::chrono::system_clock::now();

		auto s1 = std::chrono::duration_cast<std::chrono::milliseconds>(time2-time1).count();
		auto s2 = std::chrono::duration_cast<std::chrono::milliseconds>(time3-time2).count();
		auto time_total = s1 + s2;	

		cout << endl << 
			"--- 3D FFT make_field ---" << endl
			<< "1.        " << s1 << " ms" << endl
			<< "2.        " << s2 << " ms" << endl				
			<< "---" << endl
			<< "total time             " << time_total << " ms" << endl
			<< "---" << endl;


		//for(i=0;i<nr_points;i++) printf("%g + i*%g\n",creal(array_in[i]),cimag(array_in[i]));	

		/*		for(int i=0;i<mod_fielddim;i++){
		for(int j=0;j<mod_fielddim;j++){
		for(int k=0;k<mod_fielddim;k++){
		printf("%.15lf\n", host_array[i*mod_fielddim*mod_fielddim+j*mod_fielddim+k].x);
		}
		}
		}
		*/

	}

	// print some elemntes for debug
	double *print_array = (double*) host_array;
	print_double(print_array, 100); // first 100 elements
	print_double(print_array+(2*nr_points-100), 100); // last 100 elements


	/*
	fftw_free(array_in);	
	fftw_destroy_plan(back_plan);	
	*/
	cufftDestroy(back_plan);
	cudaFree(device_array);
	free(host_array);
}