/* ----------------------------------------------------------------------
   Create a plan for a parallel 3d FFT.

   Arguments:
     comm                 MPI communicator of the participating procs
     nfast,nmid,nslow     length of the 1d FFTs along each of the 3 axes
     in_ilo ... in_khi    inclusive index bounds of this proc's input data
     out_ilo ... out_khi  inclusive index bounds of this proc's output data
     scaled               0 = plan->scaled is 0, otherwise plan->scaled is 1
                          and plan->norm / plan->normnum are filled in
     permute              output permutation; the value 2 allows the final
                          remap to be skipped, (permute+1)%3 is handed to
                          the post remap
     nbuf                 set to copy_size + scratch_size

   Returns the new plan, or NULL if any allocation or remap-plan creation
   fails.  On failure the memory allocated directly by this function
   (plan, copy buffer) is released.
   NOTE(review): remap plans that were already created before a failure
   are not released here; free them with the remap module's destroy
   routine if one is available.

   The MPI_Allreduce calls are collective, so every proc of comm has to
   call this function.
------------------------------------------------------------------------- */

struct fft_plan_3d *fft_3d_create_plan (MPI_Comm comm,
       int nfast, int nmid, int nslow,
       int in_ilo, int in_ihi, int in_jlo, int in_jhi,
       int in_klo, int in_khi,
       int out_ilo, int out_ihi, int out_jlo, int out_jhi,
       int out_klo, int out_khi,
       int scaled, int permute, int *nbuf)
{
  struct fft_plan_3d *plan;
  int me,nprocs;
  int flag,remapflag;
  int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi;
  int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi;
  int third_ilo,third_ihi,third_jlo,third_jhi,third_klo,third_khi;
  int out_size,first_size,second_size,third_size,copy_size,scratch_size;
  int np1=0,np2=0,ip1,ip2;

  /* query MPI info and split the procs into an np1 x np2 grid */

  MPI_Comm_rank(comm, &me);
  MPI_Comm_size(comm, &nprocs);

  bifactor(nprocs,&np1,&np2);
  ip1 = me % np1;
  ip2 = me / np1;

  /* allocate memory for plan data struct */

  plan = (struct fft_plan_3d *) malloc(sizeof(struct fft_plan_3d));
  if (plan == NULL) return NULL;

  /* remap from initial distribution to layout needed for 1st set of 1d FFTs
     not needed if all procs own entire fast axis initially
     first indices = distribution after 1st set of FFTs */

  if (in_ilo == 0 && in_ihi == nfast-1) flag = 0;
  else flag = 1;

  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

  if (remapflag == 0) {
    first_ilo = in_ilo;
    first_ihi = in_ihi;
    first_jlo = in_jlo;
    first_jhi = in_jhi;
    first_klo = in_klo;
    first_khi = in_khi;
    plan->pre_plan = NULL;
  } else {
    first_ilo = 0;
    first_ihi = nfast - 1;
    first_jlo = ip1*nmid/np1;
    first_jhi = (ip1+1)*nmid/np1 - 1;
    first_klo = ip2*nslow/np2;
    first_khi = (ip2+1)*nslow/np2 - 1;
    plan->pre_plan =
      remap_3d_create_plan(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                           first_ilo,first_ihi,first_jlo,first_jhi,
                           first_klo,first_khi,
                           FFT_PRECISION,0,0,2);
    if (plan->pre_plan == NULL) {
      free(plan);
      return NULL;
    }
  }

  /* 1d FFTs along fast axis */

  plan->length1 = nfast;
  plan->total1 = nfast * (first_jhi-first_jlo+1) * (first_khi-first_klo+1);

  /* remap from 1st to 2nd FFT
     choose which axis is split over np1 vs np2 to minimize communication
     second indices = distribution after 2nd set of FFTs */

  second_ilo = ip1*nfast/np1;
  second_ihi = (ip1+1)*nfast/np1 - 1;
  second_jlo = 0;
  second_jhi = nmid - 1;
  second_klo = ip2*nslow/np2;
  second_khi = (ip2+1)*nslow/np2 - 1;
  plan->mid1_plan =
    remap_3d_create_plan(comm,
                         first_ilo,first_ihi,first_jlo,first_jhi,
                         first_klo,first_khi,
                         second_ilo,second_ihi,second_jlo,second_jhi,
                         second_klo,second_khi,
                         FFT_PRECISION,1,0,2);
  if (plan->mid1_plan == NULL) {
    free(plan);
    return NULL;
  }

  /* 1d FFTs along mid axis */

  plan->length2 = nmid;
  plan->total2 = (second_ihi-second_ilo+1) * nmid * (second_khi-second_klo+1);

  /* remap from 2nd to 3rd FFT
     if final distribution is permute=2 with all procs owning entire slow axis
       then this remapping goes directly to final distribution
     third indices = distribution after 3rd set of FFTs */

  if (permute == 2 && out_klo == 0 && out_khi == nslow-1) flag = 0;
  else flag = 1;

  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

  if (remapflag == 0) {
    third_ilo = out_ilo;
    third_ihi = out_ihi;
    third_jlo = out_jlo;
    third_jhi = out_jhi;
    third_klo = out_klo;
    third_khi = out_khi;
  } else {
    third_ilo = ip1*nfast/np1;
    third_ihi = (ip1+1)*nfast/np1 - 1;
    third_jlo = ip2*nmid/np2;
    third_jhi = (ip2+1)*nmid/np2 - 1;
    third_klo = 0;
    third_khi = nslow - 1;
  }

  plan->mid2_plan =
    remap_3d_create_plan(comm,
                         second_jlo,second_jhi,second_klo,second_khi,
                         second_ilo,second_ihi,
                         third_jlo,third_jhi,third_klo,third_khi,
                         third_ilo,third_ihi,
                         FFT_PRECISION,1,0,2);
  if (plan->mid2_plan == NULL) {
    free(plan);
    return NULL;
  }

  /* 1d FFTs along slow axis */

  plan->length3 = nslow;
  plan->total3 = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) * nslow;

  /* remap from 3rd FFT to final distribution
     not needed if permute = 2 and third indices = out indices on all procs */

  if (permute == 2 &&
      out_ilo == third_ilo && out_ihi == third_ihi &&
      out_jlo == third_jlo && out_jhi == third_jhi &&
      out_klo == third_klo && out_khi == third_khi) flag = 0;
  else flag = 1;

  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

  if (remapflag == 0)
    plan->post_plan = NULL;
  else {
    plan->post_plan =
      remap_3d_create_plan(comm,
                           third_klo,third_khi,third_ilo,third_ihi,
                           third_jlo,third_jhi,
                           out_klo,out_khi,out_ilo,out_ihi,
                           out_jlo,out_jhi,
                           FFT_PRECISION,(permute+1)%3,0,2);
    if (plan->post_plan == NULL) {
      free(plan);
      return NULL;
    }
  }

  /* configure plan memory pointers and allocate work space
     out_size = amount of memory given to FFT by user
     first/second/third_size = amount of memory needed after
                               pre,mid1,mid2 remaps
     copy_size = amount needed internally for extra copy of data
     scratch_size = amount needed internally for remap scratch space
     for each remap:
       use out space for result if big enough, else require copy buffer
       accumulate largest required remap scratch space */

  out_size = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
    (out_khi-out_klo+1);
  first_size = (first_ihi-first_ilo+1) * (first_jhi-first_jlo+1) *
    (first_khi-first_klo+1);
  second_size = (second_ihi-second_ilo+1) * (second_jhi-second_jlo+1) *
    (second_khi-second_klo+1);
  third_size = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) *
    (third_khi-third_klo+1);

  copy_size = 0;
  scratch_size = 0;

  if (plan->pre_plan) {
    if (first_size <= out_size)
      plan->pre_target = 0;
    else {
      plan->pre_target = 1;
      copy_size = MAX(copy_size,first_size);
    }
    scratch_size = MAX(scratch_size,first_size);
  }

  if (plan->mid1_plan) {
    if (second_size <= out_size)
      plan->mid1_target = 0;
    else {
      plan->mid1_target = 1;
      copy_size = MAX(copy_size,second_size);
    }
    scratch_size = MAX(scratch_size,second_size);
  }

  if (plan->mid2_plan) {
    if (third_size <= out_size)
      plan->mid2_target = 0;
    else {
      plan->mid2_target = 1;
      copy_size = MAX(copy_size,third_size);
    }
    scratch_size = MAX(scratch_size,third_size);
  }

  if (plan->post_plan)
    scratch_size = MAX(scratch_size,out_size);

  *nbuf = copy_size + scratch_size;

  if (copy_size) {
    plan->copy = (FFT_DATA *) malloc(copy_size*sizeof(FFT_DATA));
    if (plan->copy == NULL) {
      free(plan);
      return NULL;
    }
  } else plan->copy = NULL;

  if (scratch_size) {
    plan->scratch = (FFT_DATA *) malloc(scratch_size*sizeof(FFT_DATA));
    if (plan->scratch == NULL) {
      free(plan->copy);              /* free(NULL) is a no-op */
      free(plan);
      return NULL;
    }
  } else plan->scratch = NULL;

  /* scaling normalization
     form the grid size in double: the int product nfast*nmid*nslow
     overflows once the global grid exceeds INT_MAX points */

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

  return plan;
}
/* ----------------------------------------------------------------------
   Create a plan for a parallel 3d FFT.

   Arguments:
     comm                 MPI communicator of the participating procs
     nfast,nmid,nslow     length of the 1d FFTs along each of the 3 axes
     in_ilo ... in_khi    inclusive index bounds of this proc's input data
     out_ilo ... out_khi  inclusive index bounds of this proc's output data
     scaled               requested scaling; how it maps onto plan->scaled
                          and plan->norm depends on the 1d FFT backend
                          selected at compile time (see the #if chain)
     permute              output permutation; the value 2 allows the final
                          remap to be skipped, (permute+1)%3 is handed to
                          the post remap
     nbuf                 set to copy_size + scratch_size

   Returns the new plan, or NULL if any allocation or remap-plan creation
   fails (or, for FFT_INTEL, if a grid dimension has a factor other than
   2,3,5).  On failure the memory allocated directly by this function
   (plan, copy/scratch buffers, coeff/work arrays) is released.
   NOTE(review): remap plans that were already created before a failure
   are not released here; free them with the remap module's destroy
   routine if one is available.

   The MPI_Allreduce calls are collective, so every proc of comm has to
   call this function.
------------------------------------------------------------------------- */

struct fft_plan_3d *fft_3d_create_plan(
       MPI_Comm comm, int nfast, int nmid, int nslow,
       int in_ilo, int in_ihi, int in_jlo, int in_jhi,
       int in_klo, int in_khi,
       int out_ilo, int out_ihi, int out_jlo, int out_jhi,
       int out_klo, int out_khi,
       int scaled, int permute, int *nbuf)
{
  struct fft_plan_3d *plan;
  int me,nprocs;
  int i,num,flag,remapflag,fftflag;
  int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi;
  int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi;
  int third_ilo,third_ihi,third_jlo,third_jhi,third_klo,third_khi;
  int out_size,first_size,second_size,third_size,copy_size,scratch_size;
  int np1,np2,ip1,ip2;
  int list[50];

  // system specific variables

#ifdef FFT_SCSL
  FFT_DATA dummy_d[5];
  FFT_PREC dummy_p[5];
  int isign,isys;
  FFT_PREC scalef;
#endif
#ifdef FFT_INTEL
  FFT_DATA dummy;
#endif
#ifdef FFT_T3E
  FFT_DATA dummy[5];
  int isign,isys;
  double scalef;
#endif

  // query MPI info

  MPI_Comm_rank(comm,&me);
  MPI_Comm_size(comm,&nprocs);

  // compute division of procs in 2 dimensions not on-processor

  bifactor(nprocs,&np1,&np2);
  ip1 = me % np1;
  ip2 = me/np1;

  // allocate memory for plan data struct

  plan = (struct fft_plan_3d *) malloc(sizeof(struct fft_plan_3d));
  if (plan == NULL) return NULL;

  // remap from initial distribution to layout needed for 1st set of 1d FFTs
  // not needed if all procs own entire fast axis initially
  // first indices = distribution after 1st set of FFTs

  if (in_ilo == 0 && in_ihi == nfast-1) flag = 0;
  else flag = 1;

  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

  if (remapflag == 0) {
    first_ilo = in_ilo;
    first_ihi = in_ihi;
    first_jlo = in_jlo;
    first_jhi = in_jhi;
    first_klo = in_klo;
    first_khi = in_khi;
    plan->pre_plan = NULL;
  } else {
    first_ilo = 0;
    first_ihi = nfast - 1;
    first_jlo = ip1*nmid/np1;
    first_jhi = (ip1+1)*nmid/np1 - 1;
    first_klo = ip2*nslow/np2;
    first_khi = (ip2+1)*nslow/np2 - 1;
    plan->pre_plan =
      remap_3d_create_plan(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                           first_ilo,first_ihi,first_jlo,first_jhi,
                           first_klo,first_khi,2,0,0,FFT_PRECISION);
    if (plan->pre_plan == NULL) {
      free(plan);
      return NULL;
    }
  }

  // 1d FFTs along fast axis

  plan->length1 = nfast;
  plan->total1 = nfast * (first_jhi-first_jlo+1) * (first_khi-first_klo+1);

  // remap from 1st to 2nd FFT
  // choose which axis is split over np1 vs np2 to minimize communication
  // second indices = distribution after 2nd set of FFTs

  second_ilo = ip1*nfast/np1;
  second_ihi = (ip1+1)*nfast/np1 - 1;
  second_jlo = 0;
  second_jhi = nmid - 1;
  second_klo = ip2*nslow/np2;
  second_khi = (ip2+1)*nslow/np2 - 1;
  plan->mid1_plan =
    remap_3d_create_plan(comm,
                         first_ilo,first_ihi,first_jlo,first_jhi,
                         first_klo,first_khi,
                         second_ilo,second_ihi,second_jlo,second_jhi,
                         second_klo,second_khi,2,1,0,FFT_PRECISION);
  if (plan->mid1_plan == NULL) {
    free(plan);
    return NULL;
  }

  // 1d FFTs along mid axis

  plan->length2 = nmid;
  plan->total2 = (second_ihi-second_ilo+1) * nmid * (second_khi-second_klo+1);

  // remap from 2nd to 3rd FFT
  // if final distribution is permute=2 with all procs owning entire slow axis
  //   then this remapping goes directly to final distribution
  // third indices = distribution after 3rd set of FFTs

  if (permute == 2 && out_klo == 0 && out_khi == nslow-1) flag = 0;
  else flag = 1;

  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

  if (remapflag == 0) {
    third_ilo = out_ilo;
    third_ihi = out_ihi;
    third_jlo = out_jlo;
    third_jhi = out_jhi;
    third_klo = out_klo;
    third_khi = out_khi;
  } else {
    third_ilo = ip1*nfast/np1;
    third_ihi = (ip1+1)*nfast/np1 - 1;
    third_jlo = ip2*nmid/np2;
    third_jhi = (ip2+1)*nmid/np2 - 1;
    third_klo = 0;
    third_khi = nslow - 1;
  }

  plan->mid2_plan =
    remap_3d_create_plan(comm,
                         second_jlo,second_jhi,second_klo,second_khi,
                         second_ilo,second_ihi,
                         third_jlo,third_jhi,third_klo,third_khi,
                         third_ilo,third_ihi,2,1,0,FFT_PRECISION);
  if (plan->mid2_plan == NULL) {
    free(plan);
    return NULL;
  }

  // 1d FFTs along slow axis

  plan->length3 = nslow;
  plan->total3 = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) * nslow;

  // remap from 3rd FFT to final distribution
  // not needed if permute = 2 and third indices = out indices on all procs

  if (permute == 2 &&
      out_ilo == third_ilo && out_ihi == third_ihi &&
      out_jlo == third_jlo && out_jhi == third_jhi &&
      out_klo == third_klo && out_khi == third_khi) flag = 0;
  else flag = 1;

  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

  if (remapflag == 0)
    plan->post_plan = NULL;
  else {
    plan->post_plan =
      remap_3d_create_plan(comm,
                           third_klo,third_khi,third_ilo,third_ihi,
                           third_jlo,third_jhi,
                           out_klo,out_khi,out_ilo,out_ihi,
                           out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION);
    if (plan->post_plan == NULL) {
      free(plan);
      return NULL;
    }
  }

  // configure plan memory pointers and allocate work space
  // out_size = amount of memory given to FFT by user
  // first/second/third_size = amount of memory needed after
  //                           pre,mid1,mid2 remaps
  // copy_size = amount needed internally for extra copy of data
  // scratch_size = amount needed internally for remap scratch space
  // for each remap:
  //   out space used for result if big enough, else require copy buffer
  //   accumulate largest required remap scratch space

  out_size = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
    (out_khi-out_klo+1);
  first_size = (first_ihi-first_ilo+1) * (first_jhi-first_jlo+1) *
    (first_khi-first_klo+1);
  second_size = (second_ihi-second_ilo+1) * (second_jhi-second_jlo+1) *
    (second_khi-second_klo+1);
  third_size = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) *
    (third_khi-third_klo+1);

  copy_size = 0;
  scratch_size = 0;

  if (plan->pre_plan) {
    if (first_size <= out_size)
      plan->pre_target = 0;
    else {
      plan->pre_target = 1;
      copy_size = MAX(copy_size,first_size);
    }
    scratch_size = MAX(scratch_size,first_size);
  }

  if (plan->mid1_plan) {
    if (second_size <= out_size)
      plan->mid1_target = 0;
    else {
      plan->mid1_target = 1;
      copy_size = MAX(copy_size,second_size);
    }
    scratch_size = MAX(scratch_size,second_size);
  }

  if (plan->mid2_plan) {
    if (third_size <= out_size)
      plan->mid2_target = 0;
    else {
      plan->mid2_target = 1;
      copy_size = MAX(copy_size,third_size);
    }
    scratch_size = MAX(scratch_size,third_size);
  }

  if (plan->post_plan)
    scratch_size = MAX(scratch_size,out_size);

  *nbuf = copy_size + scratch_size;

  if (copy_size) {
    plan->copy = (FFT_DATA *) malloc(copy_size*sizeof(FFT_DATA));
    if (plan->copy == NULL) {
      free(plan);
      return NULL;
    }
  } else plan->copy = NULL;

  if (scratch_size) {
    plan->scratch = (FFT_DATA *) malloc(scratch_size*sizeof(FFT_DATA));
    if (plan->scratch == NULL) {
      free(plan->copy);              // free(NULL) is a no-op
      free(plan);
      return NULL;
    }
  } else plan->scratch = NULL;

  // system specific pre-computation of 1d FFT coeffs
  // and scaling normalization
  // the grid size is always formed in double: the int product
  //   nfast*nmid*nslow overflows once the grid exceeds INT_MAX points

#if defined(FFT_SGI)

  plan->coeff1 = (FFT_DATA *) malloc((nfast+15)*sizeof(FFT_DATA));
  plan->coeff2 = (FFT_DATA *) malloc((nmid+15)*sizeof(FFT_DATA));
  plan->coeff3 = (FFT_DATA *) malloc((nslow+15)*sizeof(FFT_DATA));

  if (plan->coeff1 == NULL || plan->coeff2 == NULL ||
      plan->coeff3 == NULL) {
    free(plan->coeff1);
    free(plan->coeff2);
    free(plan->coeff3);
    free(plan->scratch);
    free(plan->copy);
    free(plan);
    return NULL;
  }

  FFT_1D_INIT(nfast,plan->coeff1);
  FFT_1D_INIT(nmid,plan->coeff2);
  FFT_1D_INIT(nslow,plan->coeff3);

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#elif defined(FFT_SCSL)

  plan->coeff1 = (FFT_PREC *) malloc((2*nfast+30)*sizeof(FFT_PREC));
  plan->coeff2 = (FFT_PREC *) malloc((2*nmid+30)*sizeof(FFT_PREC));
  plan->coeff3 = (FFT_PREC *) malloc((2*nslow+30)*sizeof(FFT_PREC));

  if (plan->coeff1 == NULL || plan->coeff2 == NULL ||
      plan->coeff3 == NULL) {
    free(plan->coeff1);
    free(plan->coeff2);
    free(plan->coeff3);
    free(plan->scratch);
    free(plan->copy);
    free(plan);
    return NULL;
  }

  plan->work1 = (FFT_PREC *) malloc((2*nfast)*sizeof(FFT_PREC));
  plan->work2 = (FFT_PREC *) malloc((2*nmid)*sizeof(FFT_PREC));
  plan->work3 = (FFT_PREC *) malloc((2*nslow)*sizeof(FFT_PREC));

  if (plan->work1 == NULL || plan->work2 == NULL ||
      plan->work3 == NULL) {
    free(plan->work1);
    free(plan->work2);
    free(plan->work3);
    free(plan->coeff1);
    free(plan->coeff2);
    free(plan->coeff3);
    free(plan->scratch);
    free(plan->copy);
    free(plan);
    return NULL;
  }

  isign = 0;
  scalef = 1.0;
  isys = 0;

  FFT_1D_INIT(isign,nfast,scalef,dummy_d,dummy_d,plan->coeff1,dummy_p,&isys);
  FFT_1D_INIT(isign,nmid,scalef,dummy_d,dummy_d,plan->coeff2,dummy_p,&isys);
  FFT_1D_INIT(isign,nslow,scalef,dummy_d,dummy_d,plan->coeff3,dummy_p,&isys);

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#elif defined(FFT_ACML)

  plan->coeff1 = (FFT_DATA *) malloc((3*nfast+100)*sizeof(FFT_DATA));
  plan->coeff2 = (FFT_DATA *) malloc((3*nmid+100)*sizeof(FFT_DATA));
  plan->coeff3 = (FFT_DATA *) malloc((3*nslow+100)*sizeof(FFT_DATA));

  if (plan->coeff1 == NULL || plan->coeff2 == NULL ||
      plan->coeff3 == NULL) {
    free(plan->coeff1);
    free(plan->coeff2);
    free(plan->coeff3);
    free(plan->scratch);
    free(plan->copy);
    free(plan);
    return NULL;
  }

  int isign = 100;
  int isys = 1;
  int info = 0;
  FFT_DATA *dummy = NULL;

  FFT_1D(&isign,&isys,&nfast,dummy,plan->coeff1,&info);
  FFT_1D(&isign,&isys,&nmid,dummy,plan->coeff2,&info);
  FFT_1D(&isign,&isys,&nslow,dummy,plan->coeff3,&info);

  // norm and normnum are filled in for both scaled and unscaled plans

  if (scaled == 0) plan->scaled = 0;
  else plan->scaled = 1;
  plan->norm = sqrt((double) nfast*nmid*nslow);
  plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
    (out_khi-out_klo+1);

#elif defined(FFT_INTEL)

  // every grid dimension must factor into 2,3,5 only

  flag = 0;

  num = 0;
  factor(nfast,&num,list);
  for (i = 0; i < num; i++)
    if (list[i] != 2 && list[i] != 3 && list[i] != 5) flag = 1;
  num = 0;
  factor(nmid,&num,list);
  for (i = 0; i < num; i++)
    if (list[i] != 2 && list[i] != 3 && list[i] != 5) flag = 1;
  num = 0;
  factor(nslow,&num,list);
  for (i = 0; i < num; i++)
    if (list[i] != 2 && list[i] != 3 && list[i] != 5) flag = 1;

  MPI_Allreduce(&flag,&fftflag,1,MPI_INT,MPI_MAX,comm);
  if (fftflag) {
    if (me == 0) printf("ERROR: FFTs are not power of 2,3,5\n");
    free(plan->scratch);
    free(plan->copy);
    free(plan);
    return NULL;
  }

  plan->coeff1 = (FFT_DATA *) malloc((3*nfast/2+1)*sizeof(FFT_DATA));
  plan->coeff2 = (FFT_DATA *) malloc((3*nmid/2+1)*sizeof(FFT_DATA));
  plan->coeff3 = (FFT_DATA *) malloc((3*nslow/2+1)*sizeof(FFT_DATA));

  if (plan->coeff1 == NULL || plan->coeff2 == NULL ||
      plan->coeff3 == NULL) {
    free(plan->coeff1);
    free(plan->coeff2);
    free(plan->coeff3);
    free(plan->scratch);
    free(plan->copy);
    free(plan);
    return NULL;
  }

  flag = 0;
  FFT_1D_INIT(&dummy,&nfast,&flag,plan->coeff1);
  FFT_1D_INIT(&dummy,&nmid,&flag,plan->coeff2);
  FFT_1D_INIT(&dummy,&nslow,&flag,plan->coeff3);

  // scaling is inverted for this backend: plan->scaled is set
  //   when the caller asked for an unscaled transform

  if (scaled == 0) {
    plan->scaled = 1;
    plan->norm = (double) nfast*nmid*nslow;
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  } else
    plan->scaled = 0;

#elif defined(FFT_MKL)

  DftiCreateDescriptor( &(plan->handle_fast), FFT_MKL_PREC, DFTI_COMPLEX, 1,
                        (MKL_LONG)nfast);
  DftiSetValue(plan->handle_fast, DFTI_NUMBER_OF_TRANSFORMS,
               (MKL_LONG)plan->total1/nfast);
  DftiSetValue(plan->handle_fast, DFTI_PLACEMENT,DFTI_INPLACE);
  DftiSetValue(plan->handle_fast, DFTI_INPUT_DISTANCE, (MKL_LONG)nfast);
  DftiSetValue(plan->handle_fast, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nfast);
  DftiCommitDescriptor(plan->handle_fast);

  DftiCreateDescriptor( &(plan->handle_mid), FFT_MKL_PREC, DFTI_COMPLEX, 1,
                        (MKL_LONG)nmid);
  DftiSetValue(plan->handle_mid, DFTI_NUMBER_OF_TRANSFORMS,
               (MKL_LONG)plan->total2/nmid);
  DftiSetValue(plan->handle_mid, DFTI_PLACEMENT,DFTI_INPLACE);
  DftiSetValue(plan->handle_mid, DFTI_INPUT_DISTANCE, (MKL_LONG)nmid);
  DftiSetValue(plan->handle_mid, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nmid);
  DftiCommitDescriptor(plan->handle_mid);

  DftiCreateDescriptor( &(plan->handle_slow), FFT_MKL_PREC, DFTI_COMPLEX, 1,
                        (MKL_LONG)nslow);
  DftiSetValue(plan->handle_slow, DFTI_NUMBER_OF_TRANSFORMS,
               (MKL_LONG)plan->total3/nslow);
  DftiSetValue(plan->handle_slow, DFTI_PLACEMENT,DFTI_INPLACE);
  DftiSetValue(plan->handle_slow, DFTI_INPUT_DISTANCE, (MKL_LONG)nslow);
  DftiSetValue(plan->handle_slow, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nslow);
  DftiCommitDescriptor(plan->handle_slow);

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#elif defined(FFT_DEC)

  // scaling is inverted for this backend: plan->scaled is set
  //   when the caller asked for an unscaled transform

  if (scaled == 0) {
    plan->scaled = 1;
    plan->norm = (double) nfast*nmid*nslow;
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  } else
    plan->scaled = 0;

#elif defined(FFT_T3E)

  plan->coeff1 = (double *) malloc((12*nfast)*sizeof(double));
  plan->coeff2 = (double *) malloc((12*nmid)*sizeof(double));
  plan->coeff3 = (double *) malloc((12*nslow)*sizeof(double));

  if (plan->coeff1 == NULL || plan->coeff2 == NULL ||
      plan->coeff3 == NULL) {
    free(plan->coeff1);
    free(plan->coeff2);
    free(plan->coeff3);
    free(plan->scratch);
    free(plan->copy);
    free(plan);
    return NULL;
  }

  plan->work1 = (double *) malloc((8*nfast)*sizeof(double));
  plan->work2 = (double *) malloc((8*nmid)*sizeof(double));
  plan->work3 = (double *) malloc((8*nslow)*sizeof(double));

  if (plan->work1 == NULL || plan->work2 == NULL ||
      plan->work3 == NULL) {
    free(plan->work1);
    free(plan->work2);
    free(plan->work3);
    free(plan->coeff1);
    free(plan->coeff2);
    free(plan->coeff3);
    free(plan->scratch);
    free(plan->copy);
    free(plan);
    return NULL;
  }

  isign = 0;
  scalef = 1.0;
  isys = 0;

  FFT_1D_INIT(&isign,&nfast,&scalef,dummy,dummy,plan->coeff1,dummy,&isys);
  FFT_1D_INIT(&isign,&nmid,&scalef,dummy,dummy,plan->coeff2,dummy,&isys);
  FFT_1D_INIT(&isign,&nslow,&scalef,dummy,dummy,plan->coeff3,dummy,&isys);

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#elif defined(FFT_FFTW2)

  // 1d plans are shared between axes of equal length

  plan->plan_fast_forward =
    fftw_create_plan(nfast,FFTW_FORWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
  plan->plan_fast_backward =
    fftw_create_plan(nfast,FFTW_BACKWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);

  if (nmid == nfast) {
    plan->plan_mid_forward = plan->plan_fast_forward;
    plan->plan_mid_backward = plan->plan_fast_backward;
  } else {
    plan->plan_mid_forward =
      fftw_create_plan(nmid,FFTW_FORWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
    plan->plan_mid_backward =
      fftw_create_plan(nmid,FFTW_BACKWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
  }

  if (nslow == nfast) {
    plan->plan_slow_forward = plan->plan_fast_forward;
    plan->plan_slow_backward = plan->plan_fast_backward;
  } else if (nslow == nmid) {
    plan->plan_slow_forward = plan->plan_mid_forward;
    plan->plan_slow_backward = plan->plan_mid_backward;
  } else {
    plan->plan_slow_forward =
      fftw_create_plan(nslow,FFTW_FORWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
    plan->plan_slow_backward =
      fftw_create_plan(nslow,FFTW_BACKWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
  }

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#elif defined(FFT_FFTW3)

  plan->plan_fast_forward =
    FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1,
                            NULL,&nfast,1,plan->length1,
                            NULL,&nfast,1,plan->length1,
                            FFTW_FORWARD,FFTW_ESTIMATE);
  plan->plan_fast_backward =
    FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1,
                            NULL,&nfast,1,plan->length1,
                            NULL,&nfast,1,plan->length1,
                            FFTW_BACKWARD,FFTW_ESTIMATE);
  plan->plan_mid_forward =
    FFTW_API(plan_many_dft)(1, &nmid,plan->total2/plan->length2,
                            NULL,&nmid,1,plan->length2,
                            NULL,&nmid,1,plan->length2,
                            FFTW_FORWARD,FFTW_ESTIMATE);
  plan->plan_mid_backward =
    FFTW_API(plan_many_dft)(1, &nmid,plan->total2/plan->length2,
                            NULL,&nmid,1,plan->length2,
                            NULL,&nmid,1,plan->length2,
                            FFTW_BACKWARD,FFTW_ESTIMATE);
  plan->plan_slow_forward =
    FFTW_API(plan_many_dft)(1, &nslow,plan->total3/plan->length3,
                            NULL,&nslow,1,plan->length3,
                            NULL,&nslow,1,plan->length3,
                            FFTW_FORWARD,FFTW_ESTIMATE);
  plan->plan_slow_backward =
    FFTW_API(plan_many_dft)(1, &nslow,plan->total3/plan->length3,
                            NULL,&nslow,1,plan->length3,
                            NULL,&nslow,1,plan->length3,
                            FFTW_BACKWARD,FFTW_ESTIMATE);

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#else

  // default backend: kiss_fft, configs shared between axes of equal length

  plan->cfg_fast_forward = kiss_fft_alloc(nfast,0,NULL,NULL);
  plan->cfg_fast_backward = kiss_fft_alloc(nfast,1,NULL,NULL);

  if (nmid == nfast) {
    plan->cfg_mid_forward = plan->cfg_fast_forward;
    plan->cfg_mid_backward = plan->cfg_fast_backward;
  } else {
    plan->cfg_mid_forward = kiss_fft_alloc(nmid,0,NULL,NULL);
    plan->cfg_mid_backward = kiss_fft_alloc(nmid,1,NULL,NULL);
  }

  if (nslow == nfast) {
    plan->cfg_slow_forward = plan->cfg_fast_forward;
    plan->cfg_slow_backward = plan->cfg_fast_backward;
  } else if (nslow == nmid) {
    plan->cfg_slow_forward = plan->cfg_mid_forward;
    plan->cfg_slow_backward = plan->cfg_mid_backward;
  } else {
    plan->cfg_slow_forward = kiss_fft_alloc(nslow,0,NULL,NULL);
    plan->cfg_slow_backward = kiss_fft_alloc(nslow,1,NULL,NULL);
  }

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#endif

  return plan;
}
struct fft_plan_3d *fft_3d_create_plan_cuda( MPI_Comm comm, int nfast, int nmid, int nslow, int in_ilo, int in_ihi, int in_jlo, int in_jhi, int in_klo, int in_khi, int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi, int scaled, int permute, int *nbuf,bool ainit) { #ifdef FFT_CUFFT struct fft_plan_3d *plan; int me,nprocs; int i,num,flag,remapflag,fftflag; int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi; int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi; int third_ilo,third_ihi,third_jlo,third_jhi,third_klo,third_khi; int out_size,first_size,second_size,third_size,copy_size,scratch_size; int np1,np2,ip1,ip2; int list[50]; // system specific variables // query MPI info MPI_Comm_rank(comm,&me); MPI_Comm_size(comm,&nprocs); #ifndef FFT_CUFFT error->all(FLERR,"ERROR: Trying to use cuda fft without FFT_CUFFT set. Recompile with make option 'cufft=1'."); #endif // compute division of procs in 2 dimensions not on-processor bifactor_cuda(nprocs,&np1,&np2); ip1 = me % np1; ip2 = me/np1; // in case of CUDA FFT every proc does the full FFT in order to avoid data transfers (the problem is other wise heavily bandwidth limited) int ip1out = ip1; int ip2out = ip2; int np1out = np1; int np2out = np2; ip1 = 0; ip2 = 0; np1 = 1; np2 = 1; // allocate memory for plan data struct plan = (struct fft_plan_3d *) malloc(sizeof(struct fft_plan_3d)); if (plan == NULL) return NULL; plan->init=ainit; // remap from initial distribution to layout needed for 1st set of 1d FFTs // not needed if all procs own entire fast axis initially // first indices = distribution after 1st set of FFTs if (in_ilo == 0 && in_ihi == nfast-1) flag = 0; else flag = 1; if(nprocs>1)flag=1; MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm); if (remapflag == 0) { first_ilo = in_ilo; first_ihi = in_ihi; first_jlo = in_jlo; first_jhi = in_jhi; first_klo = in_klo; first_khi = in_khi; plan->pre_plan = NULL; } else { first_ilo = 0; first_ihi = nfast - 1; 
first_jlo = ip1*nmid/np1; first_jhi = (ip1+1)*nmid/np1 - 1; first_klo = ip2*nslow/np2; first_khi = (ip2+1)*nslow/np2 - 1; int members=2; if(plan->init) members=1; plan->pre_plan = remap_3d_create_plan(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, first_ilo,first_ihi,first_jlo,first_jhi, first_klo,first_khi, members,0,0,2,0); if (plan->pre_plan == NULL) return NULL; } // 1d FFTs along fast axis plan->length1 = nfast; plan->total1 = nfast * nmid * nslow; // remap from 1st to 2nd FFT // choose which axis is split over np1 vs np2 to minimize communication // second indices = distribution after 2nd set of FFTs second_ilo = ip1*nfast/np1; second_ihi = (ip1+1)*nfast/np1 - 1; second_jlo = 0; second_jhi = nmid - 1; second_klo = ip2*nslow/np2; second_khi = (ip2+1)*nslow/np2 - 1; plan->mid1_plan = remap_3d_create_plan(comm, first_ilo,first_ihi,first_jlo,first_jhi, first_klo,first_khi, second_ilo,second_ihi,second_jlo,second_jhi, second_klo,second_khi, 2,1,0,2,0); if (plan->mid1_plan == NULL) return NULL; // 1d FFTs along mid axis plan->length2 = nmid; plan->total2 = nfast * nmid * nslow; // remap from 2nd to 3rd FFT // if final distribution is permute=2 with all procs owning entire slow axis // then this remapping goes directly to final distribution // third indices = distribution after 3rd set of FFTs flag=1; MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm); if (remapflag == 0) { third_ilo = out_ilo; third_ihi = out_ihi; third_jlo = out_jlo; third_jhi = out_jhi; third_klo = out_klo; third_khi = out_khi; } else { third_ilo = ip1*nfast/np1; third_ihi = (ip1+1)*nfast/np1 - 1; third_jlo = ip2*nmid/np2; third_jhi = (ip2+1)*nmid/np2 - 1; third_klo = 0; third_khi = nslow - 1; } plan->mid2_plan = remap_3d_create_plan(comm, second_jlo,second_jhi,second_klo,second_khi, second_ilo,second_ihi, third_jlo,third_jhi,third_klo,third_khi, third_ilo,third_ihi, 2,1,0,2,0); if (plan->mid2_plan == NULL) return NULL; // 1d FFTs along slow axis plan->length3 = nslow; plan->total3 = nfast * 
nmid * nslow; // remap from 3rd FFT to final distribution // not needed if permute = 2 and third indices = out indices on all procs flag=1; MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm); if (remapflag == 0) plan->post_plan = NULL; else { plan->post_plan = remap_3d_create_plan(comm, third_klo,third_khi,third_ilo,third_ihi, third_jlo,third_jhi, out_klo,out_khi,out_ilo,out_ihi, out_jlo,out_jhi, 2,(permute+1)%3,0,2,0); if (plan->post_plan == NULL) return NULL; } // configure plan memory pointers and allocate work space // out_size = amount of memory given to FFT by user // first/second/third_size = amount of memory needed after pre,mid1,mid2 remaps // copy_size = amount needed internally for extra copy of data // scratch_size = amount needed internally for remap scratch space // for each remap: // out space used for result if big enough, else require copy buffer // accumulate largest required remap scratch space out_size = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1); first_size = (first_ihi-first_ilo+1) * (first_jhi-first_jlo+1) * (first_khi-first_klo+1); second_size = (second_ihi-second_ilo+1) * (second_jhi-second_jlo+1) * (second_khi-second_klo+1); third_size = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) * (third_khi-third_klo+1); plan->ihi_out=out_ihi; plan->ilo_out=out_ilo; plan->jhi_out=out_jhi; plan->jlo_out=out_jlo; plan->khi_out=out_khi; plan->klo_out=out_klo; copy_size = 0; scratch_size = 0; if (plan->pre_plan) { if (first_size <= out_size) plan->pre_target = 0; else { plan->pre_target = 1; copy_size = MAX(copy_size,first_size); } scratch_size = MAX(scratch_size,first_size); } if (plan->mid1_plan) { if (second_size <= out_size) plan->mid1_target = 0; else { plan->mid1_target = 1; copy_size = MAX(copy_size,second_size); } scratch_size = MAX(scratch_size,second_size); } if (plan->mid2_plan) { if (third_size <= out_size) plan->mid2_target = 0; else { plan->mid2_target = 1; copy_size = MAX(copy_size,third_size); } scratch_size = 
MAX(scratch_size,third_size); } if (plan->post_plan) scratch_size = MAX(scratch_size,out_size); *nbuf = copy_size + scratch_size; if (copy_size) { plan->copy = (FFT_DATA *) malloc(copy_size*sizeof(FFT_DATA)); if (plan->copy == NULL) return NULL; } else plan->copy = NULL; if (scratch_size) { plan->scratch = (FFT_DATA *) malloc(scratch_size*sizeof(FFT_DATA)); if (plan->scratch == NULL) return NULL; } else plan->scratch = NULL; // system specific pre-computation of 1d FFT coeffs // and scaling normalization cufftResult retvalc; int nfft = (in_ihi-in_ilo+1) * (in_jhi-in_jlo+1) * (in_khi-in_klo+1); int nfft_brick = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1); int nfft_both = MAX(nfft,nfft_brick); nfft_both=nfast*nmid*nslow; plan->cudatasize=nfft_both*sizeof(FFT_DATA); //retvalc=cufftPlan1d(&(plan->plan_fast), nfast, CUFFT_PLAN,plan->total1/nfast); //if(retvalc!=CUFFT_SUCCESS) printf("ErrorCUFFT1: %i\n",retvalc); plan->nfast=nfast; //retvalc=cufftPlan1d(&(plan->plan_mid), nmid, CUFFT_PLAN,plan->total2/nmid); //if(retvalc!=CUFFT_SUCCESS) printf("ErrorCUFFT2: %i\n",retvalc); plan->nmid=nmid; //retvalc=cufftPlan1d(&(plan->plan_slow), nslow, CUFFT_PLAN,plan->total3/nslow); //if(retvalc!=CUFFT_SUCCESS) printf("ErrorCUFFT3: %i\n",retvalc); plan->nslow=nslow; retvalc=cufftPlan3d(&(plan->plan_3d), nslow,nmid,nfast, CUFFT_PLAN); if(retvalc!=CUFFT_SUCCESS) printf("ErrorCUFFT3: %i\n",retvalc); plan->nprocs=nprocs; plan->me=me; if (scaled == 0) plan->scaled = 0; else { plan->scaled = 1; plan->norm = 1.0/(nfast*nmid*nslow); plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1); } plan->coretime=0; plan->iterate=0; plan->ffttime=0; return plan; #endif }
/* ----------------------------------------------------------------------
   Create a plan for a distributed 3d FFT

   Arguments:
   comm                 MPI communicator; its size is split into an
                          np1 x np2 processor grid via bifactor()
   nfast,nmid,nslow     global grid extent along the fast, mid and slow axes
   in_ilo,in_ihi        inclusive bounds of the input data this proc owns
   in_jlo,in_jhi          along the fast (i), mid (j) and slow (k) axes
   in_klo,in_khi
   out_ilo,out_ihi      inclusive bounds of the output data this proc owns
   out_jlo,out_jhi        along the fast (i), mid (j) and slow (k) axes
   out_klo,out_khi
   scaled               0 = caller wants no scaling of the result,
                          non-zero = caller wants the result scaled
                          (the FFT_INTEL and FFT_DEC branches store the
                          opposite flag and the un-inverted product;
                          presumably those libraries scale on their own --
                          NOTE(review): confirm against the FFT compute code)
   permute              requested permutation of the output storage order;
                          only the value 2 is tested here, (permute+1)%3 is
                          forwarded to the final remap
   nbuf                 on return: copy_size + scratch_size, i.e. the number
                          of FFT_DATA elements allocated internally

   Returns the new plan, or NULL if an allocation or a remap plan
   creation fails (or, under FFT_INTEL, if a grid extent has a prime
   factor other than 2, 3 or 5).

   NOTE(review): the early failure exits are taken per-process, while the
   MPI_Allreduce calls below are collective; a failure on only some ranks
   is not coordinated with the others.
------------------------------------------------------------------------- */

struct fft_plan_3d *fft_3d_create_plan(
       MPI_Comm comm, int nfast, int nmid, int nslow,
       int in_ilo, int in_ihi, int in_jlo, int in_jhi,
       int in_klo, int in_khi,
       int out_ilo, int out_ihi, int out_jlo, int out_jhi,
       int out_klo, int out_khi,
       int scaled, int permute, int *nbuf)
{
  struct fft_plan_3d *plan;
  int me,nprocs;
  int flag,remapflag;
  int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi;
  int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi;
  int third_ilo,third_ihi,third_jlo,third_jhi,third_klo,third_khi;
  int out_size,first_size,second_size,third_size,copy_size,scratch_size;
  int np1,np2,ip1,ip2;

  /* system specific variables
     (all declared up front so that no "goto fail" jumps over a declaration) */

#ifdef FFT_INTEL
  FFT_DATA dummy;
  int i,num,fftflag;
  int list[50];
#endif
#ifdef FFT_T3E
  FFT_DATA dummy[5];
  int isign,isys;
  double scalef;
#endif

  /* query MPI info */

  MPI_Comm_rank(comm,&me);
  MPI_Comm_size(comm,&nprocs);

  /* compute division of procs in 2 dimensions not on-processor */

  bifactor(nprocs,&np1,&np2);
  ip1 = me % np1;
  ip2 = me/np1;

  /* allocate memory for plan data struct */

  plan = (struct fft_plan_3d *) malloc(sizeof(struct fft_plan_3d));
  if (plan == NULL) return NULL;

  /* work buffers start out NULL so the failure path can free them
     unconditionally */

  plan->copy = NULL;
  plan->scratch = NULL;

  /* remap from initial distribution to layout needed for 1st set of 1d FFTs
     not needed if all procs own entire fast axis initially
     first indices = distribution after 1st set of FFTs */

  if (in_ilo == 0 && in_ihi == nfast-1)
    flag = 0;
  else
    flag = 1;

  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

  if (remapflag == 0) {
    first_ilo = in_ilo;
    first_ihi = in_ihi;
    first_jlo = in_jlo;
    first_jhi = in_jhi;
    first_klo = in_klo;
    first_khi = in_khi;
    plan->pre_plan = NULL;
  }
  else {
    first_ilo = 0;
    first_ihi = nfast - 1;
    first_jlo = ip1*nmid/np1;
    first_jhi = (ip1+1)*nmid/np1 - 1;
    first_klo = ip2*nslow/np2;
    first_khi = (ip2+1)*nslow/np2 - 1;
    plan->pre_plan =
      remap_3d_create_plan(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                           first_ilo,first_ihi,first_jlo,first_jhi,
                           first_klo,first_khi,
                           FFT_PRECISION,0,0,2);
    if (plan->pre_plan == NULL) goto fail;
  }

  /* 1d FFTs along fast axis */

  plan->length1 = nfast;
  plan->total1 = nfast * (first_jhi-first_jlo+1) * (first_khi-first_klo+1);

  /* remap from 1st to 2nd FFT
     choose which axis is split over np1 vs np2 to minimize communication
     second indices = distribution after 2nd set of FFTs */

  second_ilo = ip1*nfast/np1;
  second_ihi = (ip1+1)*nfast/np1 - 1;
  second_jlo = 0;
  second_jhi = nmid - 1;
  second_klo = ip2*nslow/np2;
  second_khi = (ip2+1)*nslow/np2 - 1;
  plan->mid1_plan =
    remap_3d_create_plan(comm,
                         first_ilo,first_ihi,first_jlo,first_jhi,
                         first_klo,first_khi,
                         second_ilo,second_ihi,second_jlo,second_jhi,
                         second_klo,second_khi,
                         FFT_PRECISION,1,0,2);
  if (plan->mid1_plan == NULL) goto fail;

  /* 1d FFTs along mid axis */

  plan->length2 = nmid;
  plan->total2 = (second_ihi-second_ilo+1) * nmid * (second_khi-second_klo+1);

  /* remap from 2nd to 3rd FFT
     if final distribution is permute=2 with all procs owning entire slow axis
       then this remapping goes directly to final distribution
     third indices = distribution after 3rd set of FFTs */

  if (permute == 2 && out_klo == 0 && out_khi == nslow-1)
    flag = 0;
  else
    flag = 1;

  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

  if (remapflag == 0) {
    third_ilo = out_ilo;
    third_ihi = out_ihi;
    third_jlo = out_jlo;
    third_jhi = out_jhi;
    third_klo = out_klo;
    third_khi = out_khi;
  }
  else {
    third_ilo = ip1*nfast/np1;
    third_ihi = (ip1+1)*nfast/np1 - 1;
    third_jlo = ip2*nmid/np2;
    third_jhi = (ip2+1)*nmid/np2 - 1;
    third_klo = 0;
    third_khi = nslow - 1;
  }

  /* bounds are passed with the axes rotated (j,k,i) for both source
     and destination */

  plan->mid2_plan =
    remap_3d_create_plan(comm,
                         second_jlo,second_jhi,second_klo,second_khi,
                         second_ilo,second_ihi,
                         third_jlo,third_jhi,third_klo,third_khi,
                         third_ilo,third_ihi,
                         FFT_PRECISION,1,0,2);
  if (plan->mid2_plan == NULL) goto fail;

  /* 1d FFTs along slow axis */

  plan->length3 = nslow;
  plan->total3 = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) * nslow;

  /* remap from 3rd FFT to final distribution
     not needed if permute = 2 and third indices = out indices on all procs */

  if (permute == 2 &&
      out_ilo == third_ilo && out_ihi == third_ihi &&
      out_jlo == third_jlo && out_jhi == third_jhi &&
      out_klo == third_klo && out_khi == third_khi)
    flag = 0;
  else
    flag = 1;

  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

  if (remapflag == 0)
    plan->post_plan = NULL;
  else {
    /* bounds are passed with the axes rotated (k,i,j) for both source
       and destination */
    plan->post_plan =
      remap_3d_create_plan(comm,
                           third_klo,third_khi,third_ilo,third_ihi,
                           third_jlo,third_jhi,
                           out_klo,out_khi,out_ilo,out_ihi,
                           out_jlo,out_jhi,
                           FFT_PRECISION,(permute+1)%3,0,2);
    if (plan->post_plan == NULL) goto fail;
  }

  /* configure plan memory pointers and allocate work space
     out_size = amount of memory given to FFT by user
     first/second/third_size = amount of memory needed after pre,mid1,mid2 remaps
     copy_size = amount needed internally for extra copy of data
     scratch_size = amount needed internally for remap scratch space
     for each remap:
       use out space for result if big enough, else require copy buffer
       accumulate largest required remap scratch space */

  out_size = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
    (out_khi-out_klo+1);
  first_size = (first_ihi-first_ilo+1) * (first_jhi-first_jlo+1) *
    (first_khi-first_klo+1);
  second_size = (second_ihi-second_ilo+1) * (second_jhi-second_jlo+1) *
    (second_khi-second_klo+1);
  third_size = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) *
    (third_khi-third_klo+1);

  copy_size = 0;
  scratch_size = 0;

  if (plan->pre_plan) {
    if (first_size <= out_size)
      plan->pre_target = 0;
    else {
      plan->pre_target = 1;
      copy_size = MAX(copy_size,first_size);
    }
    scratch_size = MAX(scratch_size,first_size);
  }

  if (plan->mid1_plan) {
    if (second_size <= out_size)
      plan->mid1_target = 0;
    else {
      plan->mid1_target = 1;
      copy_size = MAX(copy_size,second_size);
    }
    scratch_size = MAX(scratch_size,second_size);
  }

  if (plan->mid2_plan) {
    if (third_size <= out_size)
      plan->mid2_target = 0;
    else {
      plan->mid2_target = 1;
      copy_size = MAX(copy_size,third_size);
    }
    scratch_size = MAX(scratch_size,third_size);
  }

  if (plan->post_plan)
    scratch_size = MAX(scratch_size,out_size);

  *nbuf = copy_size + scratch_size;

  if (copy_size) {
    plan->copy = (FFT_DATA *) malloc(copy_size*sizeof(FFT_DATA));
    if (plan->copy == NULL) goto fail;
  }
  else plan->copy = NULL;

  if (scratch_size) {
    plan->scratch = (FFT_DATA *) malloc(scratch_size*sizeof(FFT_DATA));
    if (plan->scratch == NULL) goto fail;
  }
  else plan->scratch = NULL;

  /* system specific pre-computation of 1d FFT coeffs
     and scaling normalization
     the grid-point product is formed in double precision everywhere below:
       nfast*nmid*nslow evaluated in int overflows for large global grids */

#ifdef FFT_SGI

  plan->coeff1 = (FFT_DATA *) malloc((nfast+15)*sizeof(FFT_DATA));
  plan->coeff2 = (FFT_DATA *) malloc((nmid+15)*sizeof(FFT_DATA));
  plan->coeff3 = (FFT_DATA *) malloc((nslow+15)*sizeof(FFT_DATA));

  if (plan->coeff1 == NULL || plan->coeff2 == NULL ||
      plan->coeff3 == NULL) {
    /* release whichever of the three did succeed */
    free(plan->coeff1);
    free(plan->coeff2);
    free(plan->coeff3);
    goto fail;
  }

  FFT_1D_INIT(nfast,plan->coeff1);
  FFT_1D_INIT(nmid,plan->coeff2);
  FFT_1D_INIT(nslow,plan->coeff3);

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#endif

#ifdef FFT_INTEL

  /* every grid extent must factor into 2s, 3s and 5s only;
     agree on the verdict across all procs before erroring out */

  flag = 0;

  num = 0;
  factor(nfast,&num,list);
  for (i = 0; i < num; i++)
    if (list[i] != 2 && list[i] != 3 && list[i] != 5) flag = 1;
  num = 0;
  factor(nmid,&num,list);
  for (i = 0; i < num; i++)
    if (list[i] != 2 && list[i] != 3 && list[i] != 5) flag = 1;
  num = 0;
  factor(nslow,&num,list);
  for (i = 0; i < num; i++)
    if (list[i] != 2 && list[i] != 3 && list[i] != 5) flag = 1;

  MPI_Allreduce(&flag,&fftflag,1,MPI_INT,MPI_MAX,comm);
  if (fftflag) {
    if (me == 0) printf("ERROR: FFTs are not power of 2,3,5\n");
    goto fail;
  }

  plan->coeff1 = (FFT_DATA *) malloc((3*nfast/2+1)*sizeof(FFT_DATA));
  plan->coeff2 = (FFT_DATA *) malloc((3*nmid/2+1)*sizeof(FFT_DATA));
  plan->coeff3 = (FFT_DATA *) malloc((3*nslow/2+1)*sizeof(FFT_DATA));

  if (plan->coeff1 == NULL || plan->coeff2 == NULL ||
      plan->coeff3 == NULL) {
    /* release whichever of the three did succeed */
    free(plan->coeff1);
    free(plan->coeff2);
    free(plan->coeff3);
    goto fail;
  }

  flag = 0;
  FFT_1D_INIT(&dummy,&nfast,&flag,plan->coeff1);
  FFT_1D_INIT(&dummy,&nmid,&flag,plan->coeff2);
  FFT_1D_INIT(&dummy,&nslow,&flag,plan->coeff3);

  /* flag is deliberately the opposite of the caller's request here
     NOTE(review): presumably the library scales by itself -- confirm */

  if (scaled == 0) {
    plan->scaled = 1;
    plan->norm = (double) nfast*nmid*nslow;
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }
  else
    plan->scaled = 0;

#endif

#ifdef FFT_DEC

  /* same inverted scaling convention as the FFT_INTEL branch */

  if (scaled == 0) {
    plan->scaled = 1;
    plan->norm = (double) nfast*nmid*nslow;
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }
  else
    plan->scaled = 0;

#endif

#ifdef FFT_T3E

  plan->coeff1 = (double *) malloc((12*nfast)*sizeof(double));
  plan->coeff2 = (double *) malloc((12*nmid)*sizeof(double));
  plan->coeff3 = (double *) malloc((12*nslow)*sizeof(double));

  if (plan->coeff1 == NULL || plan->coeff2 == NULL ||
      plan->coeff3 == NULL) {
    /* release whichever of the three did succeed */
    free(plan->coeff1);
    free(plan->coeff2);
    free(plan->coeff3);
    goto fail;
  }

  plan->work1 = (double *) malloc((8*nfast)*sizeof(double));
  plan->work2 = (double *) malloc((8*nmid)*sizeof(double));
  plan->work3 = (double *) malloc((8*nslow)*sizeof(double));

  if (plan->work1 == NULL || plan->work2 == NULL ||
      plan->work3 == NULL) {
    /* coefficient tables all exist at this point; work arrays may not */
    free(plan->work1);
    free(plan->work2);
    free(plan->work3);
    free(plan->coeff1);
    free(plan->coeff2);
    free(plan->coeff3);
    goto fail;
  }

  isign = 0;
  scalef = 1.0;
  isys = 0;

  FFT_1D_INIT(&isign,&nfast,&scalef,dummy,dummy,plan->coeff1,dummy,&isys);
  FFT_1D_INIT(&isign,&nmid,&scalef,dummy,dummy,plan->coeff2,dummy,&isys);
  FFT_1D_INIT(&isign,&nslow,&scalef,dummy,dummy,plan->coeff3,dummy,&isys);

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#endif

#ifdef FFT_FFTW

  /* one forward/backward FFTW plan pair per distinct axis length;
     axes of equal length share the same plan handles */

  plan->plan_fast_forward =
    fftw_create_plan(nfast,FFTW_FORWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
  plan->plan_fast_backward =
    fftw_create_plan(nfast,FFTW_BACKWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);

  if (nmid == nfast) {
    plan->plan_mid_forward = plan->plan_fast_forward;
    plan->plan_mid_backward = plan->plan_fast_backward;
  }
  else {
    plan->plan_mid_forward =
      fftw_create_plan(nmid,FFTW_FORWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
    plan->plan_mid_backward =
      fftw_create_plan(nmid,FFTW_BACKWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
  }

  if (nslow == nfast) {
    plan->plan_slow_forward = plan->plan_fast_forward;
    plan->plan_slow_backward = plan->plan_fast_backward;
  }
  else if (nslow == nmid) {
    plan->plan_slow_forward = plan->plan_mid_forward;
    plan->plan_slow_backward = plan->plan_mid_backward;
  }
  else {
    plan->plan_slow_forward =
      fftw_create_plan(nslow,FFTW_FORWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
    plan->plan_slow_backward =
      fftw_create_plan(nslow,FFTW_BACKWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
  }

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#endif

  return plan;

  /* common failure exit: give back what this function malloc'ed itself
     NOTE(review): remap plans already stored in the plan (pre/mid1/mid2/post)
     are not released here because their destructor is not visible from
     this part of the file -- add the matching destroy calls if available */

fail:
  free(plan->copy);
  free(plan->scratch);
  free(plan);
  return NULL;
}