/**
 * Constructs the transform object on top of Data(configSettings).
 *
 * Allocates NUM_THREADS complex and real scratch buffers, allocates and
 * fills the tables produced by genWigAll / genWigAllTrans / makeweights2,
 * and creates the two batched FFTW plans p1 (FFTW_BACKWARD) and
 * p2 (FFTW_FORWARD) on the first thread's complex scratch buffers.
 * Allocation results are not checked here.
 */
SO3_trans::SO3_trans(const Config & configSettings):Data(configSettings)
{
    // Per-thread pointer tables for the three scratch families.
    workspace_cx  = (fftw_complex**)malloc(sizeof(fftw_complex*)*NUM_THREADS);
    workspace_cx2 = (fftw_complex**)malloc(sizeof(fftw_complex*)*NUM_THREADS);
    workspace_re  = (double **)malloc(sizeof(double*)*NUM_THREADS);

    for (int t = 0; t < NUM_THREADS; ++t)
    {
        workspace_cx[t]  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex) * n3);
        workspace_cx2[t] = (fftw_complex*)fftw_malloc(sizeof(fftw_complex) * n3);
        workspace_re[t]  = (double *)malloc(sizeof(double)*(24*bw + 2*bw*bw));
    }

    // Table storage; both tables share the same element count.
    wignerSpace  = ((bw*bw)*(2+3*bw+bw*bw))/3 ;
    wigners      = (double *)malloc(sizeof(double) * wignerSpace);
    wignersTrans = (double *)malloc(sizeof(double) * wignerSpace);
    weights      = (double *) malloc(sizeof(double) * (2*bw));

    // The generators use thread 0's real scratch buffer as workspace.
    genWigAll( bw, wigners, workspace_re[0] );
    genWigAllTrans( bw, wignersTrans, workspace_re[0]);
    makeweights2( bw, weights );

    {
        // Shared geometry for both plans: rank-2 transforms of shape {1, n},
        // n*n of them, unit stride, consecutive transforms n elements apart.
        int fftShape[2], embedIn[2], embedOut[2];
        fftShape[0] = 1;
        fftShape[1] = n;
        embedIn[0]  = n;
        embedIn[1]  = n*n;
        embedOut[0] = n;
        embedOut[1] = n*n;

        int fftRank   = 2 ;
        int batch     = n*n;
        int strideIn  = 1;
        int strideOut = 1;
        int distIn    = n;
        int distOut   = n;

        // p1: workspace_cx2[0] -> workspace_cx[0], backward.
        p1 = fftw_plan_many_dft( fftRank, fftShape, batch,
                                 workspace_cx2[0], embedIn, strideIn, distIn,
                                 workspace_cx[0], embedOut, strideOut, distOut,
                                 FFTW_BACKWARD, FFTW_MEASURE );

        // p2: workspace_cx[0] -> workspace_cx2[0], forward.
        p2 = fftw_plan_many_dft( fftRank, fftShape, batch,
                                 workspace_cx[0], embedIn, strideIn, distIn,
                                 workspace_cx2[0], embedOut, strideOut, distOut,
                                 FFTW_FORWARD, FFTW_MEASURE );
    }
}
/*
 * MatApply_USFFT_Private - placeholder for applying the USFFT matrix A to x,
 * storing the result in y, with the FFTW plan cached through *plan.
 *
 * As written, both the local declarations and the whole body are inside
 * "#if 0" regions, so the compiled function only runs PetscFunctionBegin and
 * PetscFunctionReturn(0); A, plan, direction, x and y are not touched.
 *
 * The disabled body shows the intended flow: resample x, fetch raw arrays,
 * create *plan on first use (per-dimension planner when dof == 1, batched
 * planner otherwise) and execute it, or re-execute an existing plan on the
 * current arrays, then restore the arrays.
 *
 * NOTE(review): the disabled code does not compile as-is: the Mat_USFFT
 * declaration has no variable name (the body refers to "usfft"), and the
 * body uses x_array although only r_array and y_array are declared.
 */
PetscErrorCode MatApply_USFFT_Private(Mat A, fftw_plan *plan, int direction, Vec x,Vec y)
{
#if 0
  PetscErrorCode ierr;
  PetscScalar    *r_array, *y_array;
  Mat_USFFT* = (Mat_USFFT*)(A->data);
#endif

  PetscFunctionBegin;
#if 0
  /* resample x to usfft->resample */
  ierr = MatResample_USFFT_Private(A, x);CHKERRQ(ierr);

  /* NB: for now we use outdim for both x and y; this will change once a full USFFT is implemented */
  ierr = VecGetArray(usfft->resample,&r_array);CHKERRQ(ierr);
  ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
  if (!*plan) { /* create a plan then execute it*/
    if (usfft->dof == 1) {
#if defined(PETSC_DEBUG_USFFT)
      ierr = PetscPrintf(PetscObjectComm((PetscObject)A), "direction = %d, usfft->ndim = %d\n", direction, usfft->ndim);CHKERRQ(ierr);
      for (int ii = 0; ii < usfft->ndim; ++ii) {
        ierr = PetscPrintf(PetscObjectComm((PetscObject)A), "usfft->outdim[%d] = %d\n", ii, usfft->outdim[ii]);CHKERRQ(ierr);
      }
#endif

      switch (usfft->dim) {
      case 1:
        *plan = fftw_plan_dft_1d(usfft->outdim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,direction,usfft->p_flag);
        break;
      case 2:
        *plan = fftw_plan_dft_2d(usfft->outdim[0],usfft->outdim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,direction,usfft->p_flag);
        break;
      case 3:
        *plan = fftw_plan_dft_3d(usfft->outdim[0],usfft->outdim[1],usfft->outdim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,direction,usfft->p_flag);
        break;
      default:
        *plan = fftw_plan_dft(usfft->ndim,usfft->outdim,(fftw_complex*)x_array,(fftw_complex*)y_array,direction,usfft->p_flag);
        break;
      }
      fftw_execute(*plan);
    } /* if (dof == 1) */
    else { /* if (dof > 1) */
      *plan = fftw_plan_many_dft(/*rank*/usfft->ndim, /*n*/usfft->outdim, /*howmany*/usfft->dof,
                                 (fftw_complex*)x_array, /*nembed*/usfft->outdim, /*stride*/usfft->dof, /*dist*/1,
                                 (fftw_complex*)y_array, /*nembed*/usfft->outdim, /*stride*/usfft->dof, /*dist*/1,
                                 /*sign*/direction, /*flags*/usfft->p_flag);
      fftw_execute(*plan);
    } /* if (dof > 1) */
  } /* if (!*plan) */
  else {  /* if (*plan) */
    /* use existing plan */
    fftw_execute_dft(*plan,(fftw_complex*)x_array,(fftw_complex*)y_array);
  }
  ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
  ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
} /* MatApply_USFFT_Private() */
/*
 * Create the forward and backward FFTW plans for every direction that has a
 * non-NULL entry in layout[].
 *
 *   size - byte size of one site datum; divided by sizeof(complex) to get
 *          the number of complex values per site
 *   ftd  - supplies the input buffer (ftd->data) and output buffer (ftd->tmp)
 *
 * Plans are stored in fwd_plan[dir] and bck_plan[dir]; the elapsed time is
 * reported through print_timing().
 */
void make_fftw_plans(int size, ft_data *ftd){
  const unsigned plan_flags = FFTW_ESTIMATE;  /* Could try FFTW_MEASURE */
  const int fft_rank = 1;
  const int dist_in = 1, dist_out = 1;
  int len[1], embed_in[1], embed_out[1];
  int ncomplex;
  int batch, stride_in, stride_out;
  int d;
  double t0 = start_timing();

  /* Number of complex values in a 4D site datum */
  ncomplex = size/sizeof(complex);

  for(d = 0; d < NDIM; d++){
    if(layout[d] == NULL)
      continue;

    /* The FT dimension */
    len[0] = layout[d]->nxfm;
    embed_in[0] = len[0];
    embed_out[0] = len[0];

    /* Number of contiguous complex values per 1D coordinate being
       transformed */
    batch = (sites_on_node*ncomplex)/len[0];
    stride_in = batch;
    stride_out = batch;

    fwd_plan[d] = fftw_plan_many_dft(fft_rank, len, batch,
                                     ftd->data, embed_in, stride_in, dist_in,
                                     ftd->tmp, embed_out, stride_out, dist_out,
                                     FFTW_FORWARD, plan_flags);
    bck_plan[d] = fftw_plan_many_dft(fft_rank, len, batch,
                                     ftd->data, embed_in, stride_in, dist_in,
                                     ftd->tmp, embed_out, stride_out, dist_out,
                                     FFTW_BACKWARD, plan_flags);
  }

  print_timing(t0, "make FFTW plans");
}
/*
 * In-place normalised inverse DFT of N strided complex samples.
 *
 *   N      - number of complex samples (transform length)
 *   in     - interleaved (re, im) doubles; sample i occupies
 *            in[2*i*stride] and in[2*i*stride + 1]
 *   stride - distance between consecutive samples, in complex elements
 *
 * FFTW's backward transform is unnormalised, so every sample is divided by
 * N afterwards.  If the planner cannot produce a plan (it returns NULL, for
 * example for a non-positive N) the data are left untouched instead of
 * executing a NULL plan.
 */
void ifft(int N, double *in, int stride)
{
    int i;
    fftw_plan p;

    p = fftw_plan_many_dft(1, &N, 1,
                           (fftw_complex *)in, NULL, stride, 0,
                           (fftw_complex *)in, NULL, stride, 0,
                           FFTW_BACKWARD, FFTW_ESTIMATE);
    if (p == NULL)
        return;  /* nothing sensible to execute */

    fftw_execute(p);
    fftw_destroy_plan(p);

    for (i = 0; i < N; i++) {
        in[i*2*stride]   /= N + 0.0;
        in[i*2*stride+1] /= N + 0.0;
    }
}
//Fourier transform of "in" into "out" along every direction mu for which
//ext_dirs[mu] is non-zero.
// - ncpp:      number of complex values per site
// - sign:      passed to FFTW as the sign of the transform
// - normalize: if non-zero, divide the result by the product of the global
//              sizes of the transformed directions
//For each selected direction the data are remapped into "buf", transformed
//in place with one batched 1D plan, and remapped back into "out".
THREADABLE_FUNCTION_6ARG(fft4d, complex*,out, complex*,in, int*,ext_dirs, int,ncpp, double,sign, int,normalize)
{
  GET_THREAD_ID();

  //first of all put in to out
  if(out!=in) vector_copy(out,in);

  //list all dirs
  int dirs[NDIM],ndirs=0;
  for(int mu=0;mu<NDIM;mu++) if(ext_dirs[mu]) dirs[ndirs++]=mu;
  verbosity_lv2_master_printf("Going to FFT: %d dimensions in total\n",ndirs);

  if(ndirs)
    {
      //allocate buffer
      complex *buf=nissa_malloc("buf",max_locd_size*ncpp,complex);

      //allocate plans: only the master thread calls the planner
      fftw_plan *plans=nissa_malloc("plans",ndirs,fftw_plan);
      if(IS_MASTER_THREAD)
	for(int idir=0;idir<ndirs;idir++)
	  plans[idir]=fftw_plan_many_dft(1,glb_size+dirs[idir],ncpp,buf,NULL,ncpp,1,buf,NULL,ncpp,1,sign,FFTW_ESTIMATE);
      THREAD_BARRIER();

      //transpose each dir in turn and take fft
      for(int idir=0;idir<ndirs;idir++)
	{
	  int mu=dirs[idir];
	  verbosity_lv2_master_printf("FFT-ing dimension %d/%d=%d\n",idir+1,ndirs,mu);
	  remap_lx_vector_to_locd(buf,out,ncpp*sizeof(complex),mu);

	  //makes all the fourier transform
	  NISSA_PARALLEL_LOOP(ioff,0,locd_perp_size_per_dir[mu])
	    fftw_execute_dft(plans[idir],buf+ioff*glb_size[mu]*ncpp,buf+ioff*glb_size[mu]*ncpp);
	  THREAD_BARRIER();

	  remap_locd_vector_to_lx(out,buf,ncpp*sizeof(complex),mu);
	}

      //destroy plans
      if(IS_MASTER_THREAD) for(int idir=0;idir<ndirs;idir++) fftw_destroy_plan(plans[idir]);

      //put normalisation: product of the sizes of the directions that were
      //actually transformed, so index glb_size through dirs[] and not with
      //the bare counter
      if(normalize)
	{
	  double norm=1;
	  for(int idir=0;idir<ndirs;idir++) norm*=glb_size[dirs[idir]];
	  double_vector_prod_double((double*)out,(double*)out,1/norm,2*ncpp*loc_vol);
	}

      nissa_free(buf);
      nissa_free(plans);
    }
}
void eigencoeffs(T *x, uint_t n, const double *tapers, const double *lambda, uint_t K, bool remove_mean, uint_t N, fftw_complex *Jkx){ T mx=tmath::mean<T>(x,n); //computes standard mean double tmp; //temporary variable T wmx; //weighted mean const int nsize=N; // Remove weighted averages, reducing bias from non-centred data //(forces eigenspectra to have zero DC component) for ( uint_t ii=0; ii<K; ii++){ if (remove_mean==true){ tmp=tmath::sum<double>(&tapers[ii*n],n); //tmp should be near zero for odd tapers (but due to round-off may not be exactly zero) if ( tmath::abs<double>(tmp) > ZERO_TOL ){ wmx=tmath::dot_mult<T,double>(x,&tapers[ii*n],n); wmx=wmx/tmp; } else { // for odd DPSS sequences, the weighted average is zero // However, we remove the regular mean for good measure wmx=mx; } } else { wmx=0; } for (uint_t jj=0; jj<n; jj++){ Jkx[ii*N+jj]=x[jj]-wmx; //prepares for in-place FFT } } //Window the data for ( uint_t ii=0; ii<K; ii++){ tmath::pw_mult<fftw_complex,double>(&Jkx[ii*N], &tapers[ii*n], n, &Jkx[ii*N] ); //zero-pad rest of array for (uint_t jj=n; jj<N; jj++){ Jkx[ii*N+jj]=0; } } // COMPUTE EIGENSPECTRA // computes all K ffts in one step fftw_load(); //potentially load fftw3 fftw_plan px = fftw_plan_many_dft(1, &nsize, K , Jkx, NULL, 1, N, Jkx, NULL, 1, N, FFTW_FORWARD, FFTW_ESTIMATE); fftw_execute(px); fftw_destroy_plan(px); }
bool do_ifft_1d_c2r(int M, int N, float* out, float* in) { /* if (num_threads>1) { fftw_init_threads(); fftw_plan_with_nthreads(num_threads); } */ int MN = M * N; fftw_complex* in2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex) * MN); fftw_complex* out2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex) * MN); for (int ii = 0; ii < MN; ii++) { in2[ii][0] = in[ii * 2]; in2[ii][1] = in[ii * 2 + 1]; } fftw_plan p; int rank = 1; int n[] = { N }; int howmany = M; int* inembed = n; int istride = M; int idist = 1; int* onembed = n; int ostride = M; int odist = 1; int sign = FFTW_BACKWARD; unsigned flags = FFTW_ESTIMATE; #pragma omp critical p = fftw_plan_many_dft(rank, n, howmany, in2, inembed, istride, idist, out2, onembed, ostride, odist, sign, flags); //p=fftw_plan_dft_1d(N,in2,out2,FFTW_BACKWARD,FFTW_ESTIMATE); fftw_execute(p); for (int ii = 0; ii < MN; ii++) { out[ii] = out2[ii][0]; } fftw_free(in2); fftw_free(out2); /* if (num_threads>1) { fftw_cleanup_threads(); } */ #pragma omp critical fftw_destroy_plan(p); return true; }
/**
 * Runs N*N forward 1D FFTs of length Z, in place on mem.
 *
 * Transform k starts at mem[k]; its samples are N*N elements apart.
 * M is not used.
 */
static void fft(int N,int M,int Z, fftw_complex *mem)
{
  const int batch = N*N;   /* number of transforms, and the sample stride */
  fftw_plan layer_plan = fftw_plan_many_dft(1, &Z, batch,
                                            mem, NULL, batch, 1,
                                            mem, NULL, batch, 1,
                                            FFTW_FORWARD, FFTW_ESTIMATE);

  fftw_execute(layer_plan);      /* execute the fft */
  fftw_destroy_plan(layer_plan);
}
/*
 * Create a batched 2D FFTW plan of shape dim0 x dim1 from spinor_in to
 * spinor_out.
 *
 * 12*howmany_wospin transforms are planned; that same number is used as the
 * element stride, and consecutive transforms start one element apart.
 * A non-zero "forward" selects FFTW_FORWARD, zero selects FFTW_BACKWARD.
 * fftw_flags == -1 is replaced by FFTW_ESTIMATE.
 * The plan is returned exactly as the planner produced it.
 */
fftw_plan spinor_fftw_plan2d(spinor *spinor_in,spinor *spinor_out,int dim0,int dim1,int howmany_wospin,unsigned int forward,int fftw_flags){
  int shape[2] = { dim0, dim1 };
  const int fft_rank = 2;
  const int batch = 12*howmany_wospin;   /* transforms per call */
  const int elem_stride = 12*howmany_wospin;
  const int batch_dist = 1;
  const int direction = forward ? FFTW_FORWARD : FFTW_BACKWARD;

  if(fftw_flags==-1)
    fftw_flags=FFTW_ESTIMATE;

  return fftw_plan_many_dft(fft_rank, shape, batch,
                            (fftw_complex*)spinor_in, NULL, elem_stride, batch_dist,
                            (fftw_complex*)spinor_out, NULL, elem_stride, batch_dist,
                            direction, fftw_flags);
}
Fourier::Fourier(int n) { const char* fname = "void Fourier::Initialize()"; VRB.Debug(fname, "Allocating memory and creating plans for FFTW."); batch_size = n; #ifdef USE_SINGLE b = (fftComplex*) fftwf_malloc(batch_size*GJP.Vol()*sizeof(fftComplex)); #endif #ifdef USE_DOUBLE b = (fftComplex*) fftw_malloc(batch_size*GJP.Vol()*sizeof(fftComplex)); #endif #ifdef USE_LONG_DOUBLE b = (fftComplex*) fftwl_malloc(batch_size*GJP.Vol()*sizeof(fftComplex)); #endif // Below needs to be adjusted for batch ffts; double check in place int vol = GJP.Vol(); int dims[3] = { GJP.Xsites(), GJP.Ysites(), GJP.Zsites() }; #ifdef USE_SINGLE p1 = fftwf_plan_many_dft(3, dims, batch_size, b, NULL, 1, vol, b, NULL, 1, vol, FFTW_BACKWARD, FFTW_EXHAUSTIVE); p2 = fftwf_plan_many_dft(3, dims, batch_size, b, NULL, 1, vol, b, NULL, 1, vol, FFTW_FORWARD, FFTW_EXHAUSTIVE); #endif #ifdef USE_DOUBLE p1 = fftw_plan_many_dft(3, dims, batch_size, b, NULL, 1, vol, b, NULL, 1, vol, FFTW_BACKWARD, FFTW_EXHAUSTIVE); p2 = fftw_plan_many_dft(3, dims, batch_size, b, NULL, 1, vol, b, NULL, 1, vol, FFTW_FORWARD, FFTW_EXHAUSTIVE); #endif #ifdef USE_LONG_DOUBLE p1 = fftwl_plan_many_dft(3, dims, batch_size, b, NULL, 1, vol, b, NULL, 1, vol, FFTW_BACKWARD, FFTW_EXHAUSTIVE); p2 = fftwl_plan_many_dft(3, dims, batch_size, b, NULL, 1, vol, b, NULL, 1, vol, FFTW_FORWARD, FFTW_EXHAUSTIVE); #endif }
/*
 * Driver: reconstruct() fills an N x N x Z complex volume, a batched 1D
 * backward FFT of length Z is applied across the N*N positions, and the
 * result is handed to print().
 *
 * Arguments: FILENAME N M Z ITER WEIGHTS (argv[5], ITER, is not read here).
 *
 * The plan is created before reconstruct() runs because FFTW_MEASURE may
 * overwrite the buffer while planning.
 *
 * NOTE(review): the program returns 1 on every path, including success;
 * this is kept as-is in case callers depend on it - confirm and switch the
 * success path to 0 if not.
 */
int main(int argc, char **argv)
{
  fftw_complex *mem;
  fftw_plan plan;
  int N,M,Z;

  if (argc <= 6) {
    printf("usage: ./reconstruct_data_gridding FILENAME N M Z ITER WEIGHTS\n");
    return 1;
  }

  /* parse each size once and reuse it everywhere below */
  N=atoi(argv[2]);
  M=atoi(argv[3]);
  Z=atoi(argv[4]);

  /* Allocate memory to hold every slice in memory after the
     2D-infft */
  mem = (fftw_complex*) nfft_malloc(sizeof(fftw_complex) * N * N * Z);
  if (mem == NULL) {
    fprintf(stderr, "could not allocate memory for %d x %d x %d samples\n", N, N, Z);
    return 1;
  }

  /* Create plan for the 1d-ifft */
  plan = fftw_plan_many_dft(1, &Z, N*N,
                            mem, NULL, N*N, 1,
                            mem, NULL, N*N, 1,
                            FFTW_BACKWARD, FFTW_MEASURE);
  if (plan == NULL) {
    /* the planner rejected the sizes; executing a NULL plan would crash */
    fprintf(stderr, "could not create FFTW plan for N=%d Z=%d\n", N, Z);
    nfft_free(mem);
    return 1;
  }

  /* execute the 2d-nfft's */
  reconstruct(argv[1],N,M,Z,atoi(argv[6]),mem);

  /* execute the 1d-fft's */
  fftw_execute(plan);

  /* write the memory back in files */
  print(N,M,Z, mem);

  /* free the plan and the memory */
  fftw_destroy_plan(plan);
  nfft_free(mem);

  return 1;
}
/*
 * Forward 3D transform of one strided component of a real field.
 *
 * Sample n is read from fx[stride*n + start], divided by the global zone
 * count, and stored as the real part of the complex input (imaginary part
 * zero).  Returns a malloc'ed array with the transform that the caller must
 * free.  When MPI is running but the code was built without COW_MPI, NULL
 * is returned.
 */
FFT_DATA *_fwd(cow_dfield *f, double *fx, int start, int stride)
{
  FFT_DATA *Fk = NULL;
  FFT_DATA *Fx = NULL;

  if (!cow_mpirunning()) {
    /* serial path: a single FFTW 3D transform over the local interior */
    int count = cow_domain_getnumlocalzonesinterior(f->domain, COW_ALL_DIMS);
    long long nglobal = cow_domain_getnumglobalzones(f->domain, COW_ALL_DIMS);
    Fx = (FFT_DATA*) malloc(count * sizeof(FFT_DATA));
    Fk = (FFT_DATA*) malloc(count * sizeof(FFT_DATA));
    for (int m=0; m<count; ++m) {
      Fx[m][0] = fx[stride * m + start] / nglobal;
      Fx[m][1] = 0.0;
    }
    int *shape = f->domain->L_nint;
    fftw_plan serial_plan = fftw_plan_many_dft(3, shape, 1,
                                               Fx, NULL, 1, 0,
                                               Fk, NULL, 1, 0,
                                               FFTW_FORWARD, FFTW_ESTIMATE);
    fftw_execute(serial_plan);
    fftw_destroy_plan(serial_plan);
    free(Fx);
    return Fk;
  }

#if (COW_MPI)
  /* distributed path: the buffer length comes from the parallel plan */
  int count;
  long long nglobal = cow_domain_getnumglobalzones(f->domain, COW_ALL_DIMS);
  struct fft_plan_3d *par_plan = call_fft_plan_3d(f->domain, &count);
  Fx = (FFT_DATA*) malloc(count * sizeof(FFT_DATA));
  Fk = (FFT_DATA*) malloc(count * sizeof(FFT_DATA));
  for (int m=0; m<count; ++m) {
    Fx[m][0] = fx[stride * m + start] / nglobal;
    Fx[m][1] = 0.0;
  }
  fft_3d(Fx, Fk, FFT_FWD, par_plan);
  free(Fx);
  fft_3d_destroy_plan(par_plan);
#endif // COW_MPI
  return Fk;
}
/*
 * Backward 3D transform of Fk, keeping the real part.
 *
 * Returns a malloc'ed array holding the real part of each transformed
 * sample divided by the global zone count; the caller must free it.  When
 * MPI is running but the code was built without COW_MPI, NULL is returned.
 */
double *_rev(cow_domain *d, FFT_DATA *Fk)
{
  FFT_DATA *Fx = NULL;
  double *fx = NULL;

  if (!cow_mpirunning()) {
    /* serial path: a single FFTW 3D transform over the local interior */
    int count = cow_domain_getnumlocalzonesinterior(d, COW_ALL_DIMS);
    long long nglobal = cow_domain_getnumglobalzones(d, COW_ALL_DIMS);
    fx = (double*) malloc(count * sizeof(double));
    Fx = (FFT_DATA*) malloc(count * sizeof(FFT_DATA));
    int *shape = d->L_nint;
    fftw_plan serial_plan = fftw_plan_many_dft(3, shape, 1,
                                               Fk, NULL, 1, 0,
                                               Fx, NULL, 1, 0,
                                               FFTW_BACKWARD, FFTW_ESTIMATE);
    fftw_execute(serial_plan);
    for (int m=0; m<count; ++m) {
      fx[m] = Fx[m][0] / nglobal;
    }
    free(Fx);
    fftw_destroy_plan(serial_plan);
    return fx;
  }

#if (COW_MPI)
  /* distributed path: the buffer length comes from the parallel plan */
  int count;
  long long nglobal = cow_domain_getnumglobalzones(d, COW_ALL_DIMS);
  struct fft_plan_3d *par_plan = call_fft_plan_3d(d, &count);
  fx = (double*) malloc(count * sizeof(double));
  Fx = (FFT_DATA*) malloc(count * sizeof(FFT_DATA));
  fft_3d(Fk, Fx, FFT_REV, par_plan);
  for (int m=0; m<count; ++m) {
    fx[m] = Fx[m][0] / nglobal;
  }
  free(Fx);
  fft_3d_destroy_plan(par_plan);
#endif // COW_MPI
  return fx;
}
/*
 * Build an in-place FFTW plan for matrix m.
 *
 * Complex matrices get a complex DFT plan (backward when "inverse" is
 * non-zero, forward otherwise).  Other matrices get an r2r plan using
 * FFTW_REDFT01 in every dimension for the inverse and FFTW_REDFT10
 * otherwise.  *scale is set to 1/m->size and, for the inverse r2r case,
 * halved once per dimension.  The dimension list is passed to FFTW in
 * reversed order.
 */
fftw_plan nl_createplan (lua_State *L, nl_Matrix *m, int inverse,
    unsigned flags, lua_Number *scale) {
  fftw_plan result;
  int d;
  const int last = m->ndims - 1;
  nl_Buffer *shape = nl_getbuffer(L, m->ndims);

  /* reverse dims */
  for (d = 0; d <= last; d++)
    shape->data.bint[d] = m->dim[last - d];

  *scale = 1.0 / m->size;

  if (!m->iscomplex) { /* fct plan */
    nl_Buffer *kinds = nl_getbuffer(L, m->ndims);
    for (d = 0; d < m->ndims; d++) {
      if (inverse) {
        kinds->data.bint[d] = FFTW_REDFT01;
        *scale *= 0.5;
      }
      else
        kinds->data.bint[d] = FFTW_REDFT10;
    }
    /* in-place, howmany == 1, dist ignored, nembed == n */
    result = fftw_plan_many_r2r(m->ndims, (const int *) shape->data.bint, 1,
        m->data, NULL, m->stride, 0,
        m->data, NULL, m->stride, 0,
        (const fftw_r2r_kind *) kinds->data.bint, flags);
    nl_freebuffer(kinds);
  }
  else { /* fft plan */
    const int direction = inverse ? FFTW_BACKWARD : FFTW_FORWARD;
    /* in-place, howmany == 1, dist ignored, nembed == n */
    result = fftw_plan_many_dft(m->ndims, (const int *) shape->data.bint, 1,
        (fftw_complex *) m->data, NULL, m->stride, 0,
        (fftw_complex *) m->data, NULL, m->stride, 0,
        direction, flags);
  }

  nl_freebuffer(shape);
  return result;
}
/** * @brief Complex DFT and inverse DFT calculation wrapper * * Compute the DFT or inverse DFT of the complex matrix X of size (nvar x nobs) * into the complex matrix Y of size (nvar x nobs) depending on the sign of sign * parameter. * * @param[out] Y Output matrix of complex numbers (double[2]) of size (nvar x nobs) * @param[in] X Input matrix of complex numbers (double[2]) of size (nvar x nobs) [can be the same as Y] * @param[in] nvar Number of variables (rows) within X and Y * @param[in] nobs Number of observations (columns) within X and Y * @param[in] sign If -1 computes the DFT, if 1 computes the inverse DFT of X * * @return Pointer to Y or NULL if DFT fails */ double *_fft(double *Y, const double *X, const unsigned long nvar, const unsigned long nobs, int sign) { const int n = nvar; unsigned long i,nelem; fftw_plan plan = fftw_plan_many_dft(1, // [int rank] Rank 1 DFT &n, // [const int *n] Number of variables within input array nobs, // [int howmany] Number of observations (number of DFT to perform) (fftw_complex *) X, // [fftw_complex *in] Input array is X (it is cast to non-const but will not be modified since FFTW_DESTROY_INPUT is not set) NULL, // [const int *inembed] Distance between each rank in input array (Not used since rank=1) 1, // [int istride] Distance between successive variables in input array (in unit of fftw_complex) nvar, // [int idist] Distance between 2 observations in input array (in unit of fftw_complex) (fftw_complex *) Y, // [fftw_complex *out] Output array is Y NULL, // [const int *onembed] Distance between each rank in output array (Not used since rank=1) 1, // [int ostride] Distance between successive variables in output array (in unit of fftw_complex) nvar, // [int odist] Distance between 2 observations in output array (in unit of fftw_complex) sign, // sign of the exponent in the formula that defines the Fourier transform (-1 or +1) FFTW_ESTIMATE); // [unsigned flags] Quickly choose a plan without performing full benchmarks 
(maybe sub-optimal but take less time) // If plan building fails, quit if(!plan) return NULL; // Execute FFTW plan fftw_execute(plan); // If we compute the inverse transform (sign == 1), normalize result by nvar (FFTW compute unnormalized transform) if(sign == 1) { nelem = 2 * nvar * nobs; for(i = 0; i < nelem; i++) Y[i] /= nvar; } // Destroy FFTW plan fftw_destroy_plan(plan); return Y; }
int main(int argc, char *argv[]) { int ret = EXIT_FAILURE; // Set up the PRNG dsfmt_t *dsfmt = malloc(sizeof(dsfmt_t)); if(dsfmt == NULL) { fprintf(stdout, "unable to allocate PRNG\n"); goto skip_deallocate_prng; } dsfmt_init_gen_rand(dsfmt, SEED); // Set up the source values double *src = fftw_malloc(N*VL*sizeof(double)); if(src == NULL) { fprintf(stdout, "unable to allocate source vector\n"); goto skip_deallocate_src; } for(unsigned int i = 0; i < N*VL; ++i) { src[i] = dsfmt_genrand_open_close(dsfmt); } // Allocate the FFT destination array double complex *fft = fftw_malloc(N*VL*sizeof(double complex)); if(fft == NULL) { fprintf(stdout, "unable to allocate fft vector\n"); goto skip_deallocate_fft; } // Execute the forward FFT fftw_plan fwd_plan = fftw_plan_many_dft_r2c(1, &N, VL, src, NULL, VL, 1, fft, NULL, VL, 1, FFTW_ESTIMATE); if(fwd_plan == NULL) { fprintf(stdout, "unable to allocate fft forward plan\n"); goto skip_deallocate_fwd_plan; } fftw_execute(fwd_plan); // Fill in the rest of the destination values using the Hermitian property. fft_r2c_1d_vec_finish(fft, N, VL); // Allocate the reverse FFT destination array double complex *dst = fftw_malloc(N*VL*sizeof(double complex)); if(dst == NULL) { fprintf(stdout, "unable to allocate dst vector\n"); goto skip_deallocate_dst; } // Perform the reverse FFT fftw_plan rev_plan = fftw_plan_many_dft(1, &N, VL, fft, NULL, VL, 1, dst, NULL, VL, 1, FFTW_BACKWARD, FFTW_ESTIMATE); if(rev_plan == NULL) { fprintf(stdout, "unable to allocate fft reverse plan\n"); goto skip_deallocate_rev_plan; } fftw_execute(rev_plan); // Compare the two vectors by sup norm double norm = 0.0; for(unsigned int i = 0; i < N*VL; ++i) { // Divide the resulting by N, because FFTW computes the un-normalized DFT: // the forward followed by reverse transform scales the data by N. 
norm = fmax(norm, cabs(dst[i]/N - src[i])); } if(norm <= 1e-6) { ret = EXIT_SUCCESS; } fftw_destroy_plan(rev_plan); skip_deallocate_rev_plan: fftw_free(dst); skip_deallocate_dst: fftw_destroy_plan(fwd_plan); skip_deallocate_fwd_plan: fftw_free(fft); skip_deallocate_fft: fftw_free(src); skip_deallocate_src: free(dsfmt); skip_deallocate_prng: // Keep valgrind happy by having fftw clean up its internal structures. This // helps ensure we aren't leaking memory. fftw_cleanup(); return ret; }
/*
 * Distributed 3D FFT of "in" into "out" as three passes of batched 1D FFTs
 * separated by remaps.
 *
 *   flag - +1 selects FFTW_FORWARD, any other value FFTW_BACKWARD
 *   plan - supplies the remap plans, the per-pass sizes (totalK / lengthK),
 *          scratch space and the optional scaling
 *
 * Each pass builds a temporary FFTW plan for (total/length) contiguous
 * transforms of the given length, runs it in place and destroys it.
 * When flag == -1 and plan->scaled is set, the first plan->normnum entries
 * of out are multiplied by plan->norm.
 */
void fft_3d(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan)
{
  const int sign = (flag == +1) ? FFTW_FORWARD : FFTW_BACKWARD;
  FFT_DATA *work, *target;
  fftw_plan pass;
  int len;

  /* optional pre-remap into out or into the plan's copy buffer */
  if (plan->pre_plan) {
    target = (plan->pre_target == 0) ? out : plan->copy;
    remap_3d((double *) in, (double *) target, (double *) plan->scratch,
             plan->pre_plan);
    work = target;
  }
  else {
    work = in;
  }

  /* pass 1: 1d FFTs sized by total1 / length1 */
  len = plan->length1;
  pass = fftw_plan_many_dft(1, &len, plan->total1/len,
                            work, NULL, 1, len,
                            work, NULL, 1, len,
                            sign, FFTW_ESTIMATE);
  fftw_execute(pass);
  fftw_destroy_plan(pass);

  /* 1st mid-remap to prepare for the 2nd FFTs */
  target = (plan->mid1_target == 0) ? out : plan->copy;
  remap_3d((double *) work, (double *) target, (double *) plan->scratch,
           plan->mid1_plan);
  work = target;

  /* pass 2: 1d FFTs along mid axis */
  len = plan->length2;
  pass = fftw_plan_many_dft(1, &len, plan->total2/len,
                            work, NULL, 1, len,
                            work, NULL, 1, len,
                            sign, FFTW_ESTIMATE);
  fftw_execute(pass);
  fftw_destroy_plan(pass);

  /* 2nd mid-remap to prepare for the 3rd FFTs */
  target = (plan->mid2_target == 0) ? out : plan->copy;
  remap_3d((double *) work, (double *) target, (double *) plan->scratch,
           plan->mid2_plan);
  work = target;

  /* pass 3: 1d FFTs along slow axis */
  len = plan->length3;
  pass = fftw_plan_many_dft(1, &len, plan->total3/len,
                            work, NULL, 1, len,
                            work, NULL, 1, len,
                            sign, FFTW_ESTIMATE);
  fftw_execute(pass);
  fftw_destroy_plan(pass);

  /* optional post-remap; the destination is always out */
  if (plan->post_plan)
    remap_3d((double *) work, (double *) out, (double *) plan->scratch,
             plan->post_plan);

  /* scaling if required */
  if (flag == -1 && plan->scaled) {
    const double factor = plan->norm;
    const int count = plan->normnum;
    int k;
    for (k = 0; k < count; k++) {
      out[k][0] *= factor;
      out[k][1] *= factor;
    }
  }
}
int dfft_init(double **data, int *local_mesh_dim, int *local_mesh_margin, int* global_mesh_dim, double *global_mesh_off, int *ks_pnum) { int i,j; /* helpers */ int mult[3]; int n_grid[4][3]; /* The four node grids. */ int my_pos[4][3]; /* The position of this_node in the node grids. */ int *n_id[4]; /* linear node identity lists for the node grids. */ int *n_pos[4]; /* positions of nodes in the node grids. */ /* FFTW WISDOM stuff. */ char wisdom_file_name[255]; FILE *wisdom_file; int wisdom_status; FFT_TRACE(fprintf(stderr,"%d: dipolar dfft_init():\n",this_node)); dfft.max_comm_size=0; dfft.max_mesh_size=0; for(i=0;i<4;i++) { n_id[i] = (int *) malloc(1*n_nodes*sizeof(int)); n_pos[i] = (int *) malloc(3*n_nodes*sizeof(int)); } /* === node grids === */ /* real space node grid (n_grid[0]) */ for(i=0;i<3;i++) { n_grid[0][i] = node_grid[i]; my_pos[0][i] = node_pos[i]; } for(i=0;i<n_nodes;i++) { map_node_array(i,&(n_pos[0][3*i+0])); n_id[0][get_linear_index( n_pos[0][3*i+0],n_pos[0][3*i+1],n_pos[0][3*i+2], n_grid[0])] = i; } /* FFT node grids (n_grid[1 - 3]) */ calc_2d_grid(n_nodes,n_grid[1]); /* resort n_grid[1] dimensions if necessary */ dfft.plan[1].row_dir = map_3don2d_grid(n_grid[0], n_grid[1], mult); dfft.plan[0].n_permute = 0; for(i=1;i<4;i++) dfft.plan[i].n_permute = (dfft.plan[1].row_dir+i)%3; for(i=0;i<3;i++) { n_grid[2][i] = n_grid[1][(i+1)%3]; n_grid[3][i] = n_grid[1][(i+2)%3]; } dfft.plan[2].row_dir = (dfft.plan[1].row_dir-1)%3; dfft.plan[3].row_dir = (dfft.plan[1].row_dir-2)%3; /* === communication groups === */ /* copy local mesh off real space charge assignment grid */ for(i=0;i<3;i++) dfft.plan[0].new_mesh[i] = local_mesh_dim[i]; for(i=1; i<4;i++) { dfft.plan[i].g_size=fft_find_comm_groups(n_grid[i-1], n_grid[i], n_id[i-1], n_id[i], dfft.plan[i].group, n_pos[i], my_pos[i]); if(dfft.plan[i].g_size==-1) { /* try permutation */ j = n_grid[i][(dfft.plan[i].row_dir+1)%3]; n_grid[i][(dfft.plan[i].row_dir+1)%3] = n_grid[i][(dfft.plan[i].row_dir+2)%3]; 
n_grid[i][(dfft.plan[i].row_dir+2)%3] = j; dfft.plan[i].g_size=fft_find_comm_groups(n_grid[i-1], n_grid[i], n_id[i-1], n_id[i], dfft.plan[i].group, n_pos[i], my_pos[i]); if(dfft.plan[i].g_size==-1) { fprintf(stderr,"%d: dipolar INTERNAL ERROR: fft_find_comm_groups error\n", this_node); errexit(); } } dfft.plan[i].send_block = (int *)realloc(dfft.plan[i].send_block, 6*dfft.plan[i].g_size*sizeof(int)); dfft.plan[i].send_size = (int *)realloc(dfft.plan[i].send_size, 1*dfft.plan[i].g_size*sizeof(int)); dfft.plan[i].recv_block = (int *)realloc(dfft.plan[i].recv_block, 6*dfft.plan[i].g_size*sizeof(int)); dfft.plan[i].recv_size = (int *)realloc(dfft.plan[i].recv_size, 1*dfft.plan[i].g_size*sizeof(int)); dfft.plan[i].new_size = fft_calc_local_mesh(my_pos[i], n_grid[i], global_mesh_dim, global_mesh_off, dfft.plan[i].new_mesh, dfft.plan[i].start); permute_ifield(dfft.plan[i].new_mesh,3,-(dfft.plan[i].n_permute)); permute_ifield(dfft.plan[i].start,3,-(dfft.plan[i].n_permute)); dfft.plan[i].n_ffts = dfft.plan[i].new_mesh[0]*dfft.plan[i].new_mesh[1]; /* === send/recv block specifications === */ for(j=0; j<dfft.plan[i].g_size; j++) { int k, node; /* send block: this_node to comm-group-node i (identity: node) */ node = dfft.plan[i].group[j]; dfft.plan[i].send_size[j] = fft_calc_send_block(my_pos[i-1], n_grid[i-1], &(n_pos[i][3*node]), n_grid[i], global_mesh_dim, global_mesh_off, &(dfft.plan[i].send_block[6*j])); permute_ifield(&(dfft.plan[i].send_block[6*j]),3,-(dfft.plan[i-1].n_permute)); permute_ifield(&(dfft.plan[i].send_block[6*j+3]),3,-(dfft.plan[i-1].n_permute)); if(dfft.plan[i].send_size[j] > dfft.max_comm_size) dfft.max_comm_size = dfft.plan[i].send_size[j]; /* First plan send blocks have to be adjusted, since the CA grid may have an additional margin outside the actual domain of the node */ if(i==1) { for(k=0;k<3;k++) dfft.plan[1].send_block[6*j+k ] += local_mesh_margin[2*k]; } /* recv block: this_node from comm-group-node i (identity: node) */ dfft.plan[i].recv_size[j] 
= fft_calc_send_block(my_pos[i], n_grid[i], &(n_pos[i-1][3*node]), n_grid[i-1], global_mesh_dim, global_mesh_off,&(dfft.plan[i].recv_block[6*j])); permute_ifield(&(dfft.plan[i].recv_block[6*j]),3,-(dfft.plan[i].n_permute)); permute_ifield(&(dfft.plan[i].recv_block[6*j+3]),3,-(dfft.plan[i].n_permute)); if(dfft.plan[i].recv_size[j] > dfft.max_comm_size) dfft.max_comm_size = dfft.plan[i].recv_size[j]; } for(j=0;j<3;j++) dfft.plan[i].old_mesh[j] = dfft.plan[i-1].new_mesh[j]; if(i==1) dfft.plan[i].element = 1; else { dfft.plan[i].element = 2; for(j=0; j<dfft.plan[i].g_size; j++) { dfft.plan[i].send_size[j] *= 2; dfft.plan[i].recv_size[j] *= 2; } } /* DEBUG */ for(j=0;j<n_nodes;j++) { /* MPI_Barrier(comm_cart); */ if(j==this_node) FFT_TRACE(fft_print_fft_plan(dfft.plan[i])); } } /* Factor 2 for complex fields */ dfft.max_comm_size *= 2; dfft.max_mesh_size = (local_mesh_dim[0]*local_mesh_dim[1]*local_mesh_dim[2]); for(i=1;i<4;i++) if(2*dfft.plan[i].new_size > dfft.max_mesh_size) dfft.max_mesh_size = 2*dfft.plan[i].new_size; FFT_TRACE(fprintf(stderr,"%d: dfft.max_comm_size = %d, dfft.max_mesh_size = %d\n", this_node,dfft.max_comm_size,dfft.max_mesh_size)); /* === pack function === */ for(i=1;i<4;i++) { dfft.plan[i].pack_function = fft_pack_block_permute2; FFT_TRACE(fprintf(stderr,"%d: forw plan[%d] permute 2 \n",this_node,i)); } (*ks_pnum)=6; if(dfft.plan[1].row_dir==2) { dfft.plan[1].pack_function = fft_pack_block; FFT_TRACE(fprintf(stderr,"%d: forw plan[%d] permute 0 \n",this_node,1)); (*ks_pnum)=4; } else if(dfft.plan[1].row_dir==1) { dfft.plan[1].pack_function = fft_pack_block_permute1; FFT_TRACE(fprintf(stderr,"%d: forw plan[%d] permute 1 \n",this_node,1)); (*ks_pnum)=5; } /* Factor 2 for complex numbers */ dfft.send_buf = (double *)realloc(dfft.send_buf, dfft.max_comm_size*sizeof(double)); dfft.recv_buf = (double *)realloc(dfft.recv_buf, dfft.max_comm_size*sizeof(double)); (*data) = (double *)realloc((*data), dfft.max_mesh_size*sizeof(double)); dfft.data_buf = 
(double *)realloc(dfft.data_buf, dfft.max_mesh_size*sizeof(double)); if(!(*data) || !dfft.data_buf || !dfft.recv_buf || !dfft.send_buf) { fprintf(stderr,"%d: Could not allocate FFT data arays\n",this_node); errexit(); } fftw_complex *c_data = (fftw_complex *) (*data); /* === FFT Routines (Using FFTW / RFFTW package)=== */ for(i=1;i<4;i++) { dfft.plan[i].dir = FFTW_FORWARD; /* FFT plan creation. Attention: destroys contents of c_data/data and c_data_buf/data_buf. */ wisdom_status = FFTW_FAILURE; sprintf(wisdom_file_name,"dfftw3_1d_wisdom_forw_n%d.file", dfft.plan[i].new_mesh[2]); if( (wisdom_file=fopen(wisdom_file_name,"r"))!=NULL ) { wisdom_status = fftw_import_wisdom_from_file(wisdom_file); fclose(wisdom_file); } if(dfft.init_tag==1) fftw_destroy_plan(dfft.plan[i].our_fftw_plan); //printf("dfft.plan[%d].n_ffts=%d\n",i,dfft.plan[i].n_ffts); dfft.plan[i].our_fftw_plan = fftw_plan_many_dft(1,&dfft.plan[i].new_mesh[2],dfft.plan[i].n_ffts, c_data,NULL,1,dfft.plan[i].new_mesh[2], c_data,NULL,1,dfft.plan[i].new_mesh[2], dfft.plan[i].dir,FFTW_PATIENT); if( wisdom_status == FFTW_FAILURE && (wisdom_file=fopen(wisdom_file_name,"w"))!=NULL ) { fftw_export_wisdom_to_file(wisdom_file); fclose(wisdom_file); } dfft.plan[i].fft_function = fftw_execute; } /* === The BACK Direction === */ /* this is needed because slightly different functions are used */ for(i=1;i<4;i++) { dfft.back[i].dir = FFTW_BACKWARD; wisdom_status = FFTW_FAILURE; sprintf(wisdom_file_name,"dfftw3_1d_wisdom_back_n%d.file", dfft.plan[i].new_mesh[2]); if( (wisdom_file=fopen(wisdom_file_name,"r"))!=NULL ) { wisdom_status = fftw_import_wisdom_from_file(wisdom_file); fclose(wisdom_file); } if(dfft.init_tag==1) fftw_destroy_plan(dfft.back[i].our_fftw_plan); dfft.back[i].our_fftw_plan = fftw_plan_many_dft(1,&dfft.plan[i].new_mesh[2],dfft.plan[i].n_ffts, c_data,NULL,1,dfft.plan[i].new_mesh[2], c_data,NULL,1,dfft.plan[i].new_mesh[2], dfft.back[i].dir,FFTW_PATIENT); if( wisdom_status == FFTW_FAILURE && 
(wisdom_file=fopen(wisdom_file_name,"w"))!=NULL ) { fftw_export_wisdom_to_file(wisdom_file); fclose(wisdom_file); } dfft.back[i].fft_function = fftw_execute; dfft.back[i].pack_function = fft_pack_block_permute1; FFT_TRACE(fprintf(stderr,"%d: back plan[%d] permute 1 \n",this_node,i)); } if(dfft.plan[1].row_dir==2) { dfft.back[1].pack_function = fft_pack_block; FFT_TRACE(fprintf(stderr,"%d: back plan[%d] permute 0 \n",this_node,1)); } else if(dfft.plan[1].row_dir==1) { dfft.back[1].pack_function = fft_pack_block_permute2; FFT_TRACE(fprintf(stderr,"%d: back plan[%d] permute 2 \n",this_node,1)); } dfft.init_tag=1; /* free(data); */ for(i=0;i<4;i++) { free(n_id[i]); free(n_pos[i]); } return dfft.max_mesh_size; }
// FFT-based convolution of `signal` with `filter`.
//
// Both operands are packed into one shared complex buffer (signal first,
// filter right after it), transformed with a single batched in-place FFT,
// multiplied element-wise, transformed back, and finally reordered into the
// output array. `expand` selects the full-size output over the signal-size
// output; `kind` selects the batching mode.
Array<T> fftconvolve(Array<T> const& signal, Array<T> const& filter,
                     const bool expand, ConvolveBatchKind kind)
{
    const af::dim4 sd = signal.dims();
    const af::dim4 fd = filter.dims();

    // Shape of the shared buffer: transformed dimensions are padded to a
    // power of two, dimension `baseDim` holds the signal and filter batches
    // back to back, and the rest is collapsed to 1.
    af::dim4 packed_dims;
    for (dim_t k = 0; k < 4; k++) {
        if (k < baseDim)
            packed_dims[k] = nextpow2((unsigned)(sd[k] + fd[k] - 1));
        else if (k == baseDim)
            packed_dims[k] = sd[k] + fd[k];
        else
            packed_dims[k] = 1;
    }

    // FFT extents are stored in reverse dimension order; dimension 0 is
    // halved because two real samples are packed into one complex element.
    int fft_dims[baseDim];
    dim_t fftScale = 1;
    for (dim_t k = 0; k < 4 && k < baseDim; k++) {
        fft_dims[baseDim-k-1] = (k == 0) ? packed_dims[k] / 2 : packed_dims[k];
        fftScale *= fft_dims[baseDim-k-1];
    }

    Array<convT> packed = createEmptyArray<convT>(packed_dims);
    convT *packed_ptr = packed.get();
    const af::dim4 packed_strides = packed.strides();

    // Logical dims/strides of the signal and filter views inside `packed`:
    // transformed dimensions use the padded extent, batch dimensions keep
    // the extent of the respective input.
    af::dim4 sig_tmp_dims, sig_tmp_strides;
    af::dim4 filter_tmp_dims, filter_tmp_strides;
    for (dim_t k = 0; k < 4; k++) {
        const bool padded = (k == 0) || (k < baseDim);
        if (padded) {
            sig_tmp_dims[k]    = packed_dims[k];
            filter_tmp_dims[k] = packed_dims[k];
        } else {
            sig_tmp_dims[k]    = sd[k];
            filter_tmp_dims[k] = fd[k];
        }

        if (k == 0) {
            sig_tmp_strides[k]    = 1;
            filter_tmp_strides[k] = 1;
        } else {
            sig_tmp_strides[k]    = sig_tmp_strides[k - 1] * sig_tmp_dims[k - 1];
            filter_tmp_strides[k] = filter_tmp_strides[k - 1] * filter_tmp_dims[k - 1];
        }
    }

    // The signal occupies the start of the buffer, the filter follows it.
    convT *sig_tmp_ptr    = packed_ptr;
    convT *filter_tmp_ptr = packed_ptr + sig_tmp_strides[3] * sig_tmp_dims[3];

    // Number of packed complex elements in dimension 0
    dim_t sig_half_d0 = divup(sd[0], 2);

    // Signal: packed as complex with half-length first dimension, zero padded.
    packData<convT, T>(sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides, signal);

    // Filter: zero padded.
    padArray<convT, T>(filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides, filter);

    // Batched in-place transform parameters shared by both directions.
    const dim_t fft_batch  = packed_dims[baseDim];
    const dim_t fft_stride = packed_strides[0];
    const dim_t fft_dist   = packed_strides[baseDim] / 2;

    // Forward FFT of signal and filter in one batch
    if (isDouble) {
        fftw_plan fwd = fftw_plan_many_dft(baseDim, fft_dims, fft_batch,
                                           (fftw_complex*)packed.get(), NULL,
                                           fft_stride, fft_dist,
                                           (fftw_complex*)packed.get(), NULL,
                                           fft_stride, fft_dist,
                                           FFTW_FORWARD, FFTW_ESTIMATE);
        fftw_execute(fwd);
        fftw_destroy_plan(fwd);
    }
    else {
        fftwf_plan fwd = fftwf_plan_many_dft(baseDim, fft_dims, fft_batch,
                                             (fftwf_complex*)packed.get(), NULL,
                                             fft_stride, fft_dist,
                                             (fftwf_complex*)packed.get(), NULL,
                                             fft_stride, fft_dist,
                                             FFTW_FORWARD, FFTW_ESTIMATE);
        fftwf_execute(fwd);
        fftwf_destroy_plan(fwd);
    }

    // The product is written over the filter view for ONE2MANY and over the
    // signal view otherwise; the same view is later read by reorderOutput.
    const bool into_filter = (kind == ONE2MANY);
    convT *prod_ptr               = into_filter ? filter_tmp_ptr     : sig_tmp_ptr;
    const af::dim4 &prod_dims     = into_filter ? filter_tmp_dims    : sig_tmp_dims;
    const af::dim4 &prod_strides  = into_filter ? filter_tmp_strides : sig_tmp_strides;

    // Multiply filter and signal FFT arrays
    complexMultiply<convT>(prod_ptr, prod_dims, prod_strides,
                           sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides,
                           filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides,
                           kind);

    // Inverse FFT over the same batch
    if (isDouble) {
        fftw_plan bwd = fftw_plan_many_dft(baseDim, fft_dims, fft_batch,
                                           (fftw_complex*)packed.get(), NULL,
                                           fft_stride, fft_dist,
                                           (fftw_complex*)packed.get(), NULL,
                                           fft_stride, fft_dist,
                                           FFTW_BACKWARD, FFTW_ESTIMATE);
        fftw_execute(bwd);
        fftw_destroy_plan(bwd);
    }
    else {
        fftwf_plan bwd = fftwf_plan_many_dft(baseDim, fft_dims, fft_batch,
                                             (fftwf_complex*)packed.get(), NULL,
                                             fft_stride, fft_dist,
                                             (fftwf_complex*)packed.get(), NULL,
                                             fft_stride, fft_dist,
                                             FFTW_BACKWARD, FFTW_ESTIMATE);
        fftwf_execute(bwd);
        fftwf_destroy_plan(bwd);
    }

    // Output dimensions
    dim4 oDims(1);
    if (!expand) {
        oDims = sd;
        if (kind == ONE2MANY) {
            for (dim_t i = baseDim; i < 4; ++i)
                oDims[i] = fd[i];
        }
    } else {
        const bool grow_all = (kind == ONE2ONE || kind == ONE2MANY);
        for (dim_t d = 0; d < 4; ++d) {
            if (grow_all || d < baseDim)
                oDims[d] = sd[d] + fd[d] - 1;
            else
                oDims[d] = sd[d];
        }
    }

    Array<T> out = createEmptyArray<T>(oDims);
    T* out_ptr = out.get();
    const af::dim4 out_dims    = out.dims();
    const af::dim4 out_strides = out.strides();
    const af::dim4 filter_dims = filter.dims();

    // Reorder the output from whichever view received the product
    reorderOutput<T, convT, roundOut>
        (out_ptr, out_dims, out_strides,
         prod_ptr, prod_dims, prod_strides,
         filter_dims, sig_half_d0, baseDim, fftScale, expand);

    return out;
}
void fft(int N, double *in, int stride) { fftw_plan p; p = fftw_plan_many_dft(1,&N, 1, (fftw_complex *)in, NULL, stride, 0, (fftw_complex *)in, NULL, stride, 0, FFTW_FORWARD, FFTW_ESTIMATE); fftw_execute(p); fftw_destroy_plan(p); }
void init_gfft() { // This will init the plans needed by gfft // Transform of NY/NPROC arrays of (logical) size [NX, NZ] // The physical size is [NX, NZ+2] // We use in-place transforms int i; double complex *wi1, *whi1; double *wir1, *whir1; const int n_size2D[2] = {NX, NZ}; const int n_size1D[1] = {NY_COMPLEX}; wi1 = (double complex *) fftw_malloc( sizeof(double complex) * NTOTAL_COMPLEX); if (wi1 == NULL) ERROR_HANDLER( ERROR_CRITICAL, "No memory for wi1 allocation"); whi1 = (double complex *) fftw_malloc( sizeof(double complex) * NX*(NY/2+1)); if (whi1 == NULL) ERROR_HANDLER( ERROR_CRITICAL, "No memory for wi1 allocation"); wir1 = (double *) wi1; whir1= (double *) whi1; for(i = 0 ; i < NTOTAL_COMPLEX; i++) { wi1[i]=1.0; } #ifdef _OPENMP fftw_plan_with_nthreads( nthreads ); #endif r2c_2d = fftw_plan_many_dft_r2c(2, n_size2D, NY / NPROC, wir1, NULL, 1, (NZ+2)*NX, wi1, NULL, 1, (NZ+2)*NX/2, FFT_PLANNING); if (r2c_2d == NULL) ERROR_HANDLER( ERROR_CRITICAL, "FFTW R2C_2D plan creation failed"); c2r_2d = fftw_plan_many_dft_c2r(2, n_size2D, NY / NPROC, wi1, NULL, 1, (NZ+2)*NX/2, wir1, NULL, 1, (NZ+2)*NX , FFT_PLANNING); if (c2r_2d == NULL) ERROR_HANDLER( ERROR_CRITICAL, "FFTW C2R_2D plan creation failed"); r2cfft_2Dslice = fftw_plan_dft_r2c_2d(NX,NY,wrh3,wh3,FFT_PLANNING); //,whir1,whi1 if (r2cfft_2Dslice == NULL) ERROR_HANDLER( ERROR_CRITICAL, "FFTW r2c slice plan creation failed"); // 1D transforms: This are actually c2c transforms, but are used for global 3D transforms. // We will transform forward and backward an array of logical size [NX/NPROC, NY, (NZ+2)/2] along the 2nd dimension // We will do NZ_COMPLEX transforms along Y. 
Will need a loop on NX/NPROC // We use &w1[NZ_COMPLEX] so that alignement check is done properly (see SIMD in fftw Documentation) #ifdef _OPENMP fftw_plan_with_nthreads( 1 ); #endif r2c_1d = fftw_plan_many_dft(1, n_size1D, NZ_COMPLEX, &wi1[NZ_COMPLEX], NULL, NZ_COMPLEX, 1, &wi1[NZ_COMPLEX], NULL, NZ_COMPLEX, 1, FFTW_FORWARD, FFT_PLANNING); if (r2c_1d == NULL) ERROR_HANDLER( ERROR_CRITICAL, "FFTW R2C_1D plan creation failed"); c2r_1d = fftw_plan_many_dft(1, n_size1D, NZ_COMPLEX, &wi1[NZ_COMPLEX], NULL, NZ_COMPLEX, 1, &wi1[NZ_COMPLEX], NULL, NZ_COMPLEX, 1, FFTW_BACKWARD, FFT_PLANNING); if (c2r_1d == NULL) ERROR_HANDLER( ERROR_CRITICAL, "FFTW C2R_1D plan creation failed"); // init transpose routines init_transpose(); // Let's see which method is faster (with our without threads) fftw_free(wi1); fftw_free(whi1); fft_timer=0.0; return; }
//squares the symbol by autoconvolving the fourier coefficients //size of input FQ: nx*nz*k, size of outpout FQ2:nx*nz*(2k-1) void Qsquare(complex const *FQ,complex *FQ2,int nx,int nz,int k){ complex *FQ1; FQ1 = (complex *) calloc((2*k-1)*nx*nz,sizeof(complex)); int m,n,l; // printf(" k = %d \n",k); for (l=0;l<k;l++) {//printf("l= %d\n",l); for (m=0;m<nx;m++) { for(n=0;n<nz;n++) {//printf("%d ",l+(k)*(m*nz+n)); FQ1[l+(2*k-1)*(m*nz+n)] = FQ[l+k*(m*nz+n)]; //printf("set \n"); } } } // printf("padded array set in Qsquare \n"); int rank =1; int *dims; dims = (int*) malloc(rank*sizeof(int)); dims[0]=(2*k-1); //dims[1]=nz; dims[2]=2*k-1; int howmany = nx*nz; //inembed=onembed=dims int istride = 1; int ostride = 1; int idist = (2*k-1); int odist = idist; // printf("variables set ...\n"); /* clock_t start,end; double elapsed; start=clock(); */ fftw_plan plan_forward,plan_backward; plan_forward = fftw_plan_many_dft(rank,dims,howmany,FQ1,NULL,istride,idist,FQ1,NULL,ostride,odist,FFTW_FORWARD,FFTW_ESTIMATE); plan_backward = fftw_plan_many_dft(rank,dims,howmany,FQ1,NULL,istride,idist,FQ2,NULL,ostride,odist,FFTW_BACKWARD,FFTW_ESTIMATE); fftw_execute(plan_forward); //printf("plan executed \n"); for (l=0;l<2*k-1;l++) {//printf("l= %d\n",l); for (m=0;m<nx;m++) { for(n=0;n<nz;n++) {//printf("%d ",l+(k)*(m*nz+n)); FQ1[l+(2*k-1)*(m*nz+n)] = FQ1[l+(2*k-1)*(m*nz+n)]*FQ1[l+(2*k-1)*(m*nz+n)]/(2*k-1); //printf("set \n"); } } } /* end=clock(); elapsed = ((double)(end-start))/CLOCKS_PER_SEC; printf(" time taken to set up plans in Qsquare: %lf \n",elapsed); */ fftw_execute(plan_backward); //cleaning fftw_destroy_plan(plan_forward); fftw_destroy_plan(plan_backward); free(FQ1); }
/*
 * dgt_fac: factorization-based transform writing M*N*R*W complex
 * coefficients to cout (N = L/a), finished by a batched length-M FFT.
 *
 *   f      : input, read as W consecutive blocks of L complex values.
 *   gf     : pre-factorized window data, read as interleaved re/im doubles.
 *            NOTE(review): its exact layout is produced elsewhere -- confirm
 *            against the routine that builds it.
 *   L      : length of each of the W input blocks.
 *   W      : number of input blocks.
 *   R      : number of windows (multiplies the output size).
 *   a, M   : presumably time shift and number of channels -- TODO confirm;
 *            the code only uses them through b=L/M, N=L/a, c=gcd(a,M),
 *            p=a/c, q=M/c and d=b/p.
 *   cout   : output, M*N*R*W complex values, written in place.
 *   dotime : when non-zero, timings of the phases are printed with printf.
 *
 * Work buffers ff, cf and sbuf are allocated with ltfat_malloc and released
 * before returning; all three FFTW plans are destroyed before returning.
 */
void dgt_fac(ltfat_complex *f, ltfat_complex *gf, const int L, const int W,
	     const int R, const int a, const int M, ltfat_complex *cout, int dotime)
{

   /*  --------- initial declarations -------------- */

   int b, N, c, d, p, q, h_a, h_m;

   double *gbase, *fbase, *cbase;

   int l, k, r, s, u, w, rw, nm, mm, km;
   int ld2a, ld1b, ld3b;
   int ld4c, ld5c;
   int rem;

   fftw_plan p_before, p_after, p_veryend;
   double *ff, *cf, *ffp, *sbuf, *fp, *cfp;

   double scalconst;

   double st0, st1, st6, st7;

   /*  ----------- calculation of parameters and plans -------- */

   if (dotime)
   {
      st0=ltfat_time();
   }

   b=L/M;
   N=L/a;

   /* gcd also returns two auxiliary integers through h_a and h_m
    * (presumably the Bezout coefficients -- TODO confirm); only h_a is
    * used below, with its sign flipped. */
   c=gcd(a, M,&h_a, &h_m);
   p=a/c;
   q=M/c;
   d=b/p;

   h_a=-h_a;

   /* Scaling constant needed because of FFTWs normalization. */
   scalconst=1.0/((double)d*sqrt((double)M));

   /* Work buffers, all holding interleaved re/im doubles:
    *   ff   : factorized signal,       d*p*q*W   complex values
    *   cf   : factorized coefficients, d*q*q*W*R complex values
    *   sbuf : scratch for one length-d FFT */
   ff = (double*)ltfat_malloc(2*d*p*q*W*sizeof(double));
   cf = (double*)ltfat_malloc(2*d*q*q*W*R*sizeof(double));
   sbuf = (double*)ltfat_malloc(2*d*sizeof(double));

   /* Create plans. In-place. Both operate on sbuf, length d. */

   p_before = fftw_plan_dft_1d(d, (ltfat_complex*)sbuf, (ltfat_complex*)sbuf,
			       FFTW_FORWARD, FFTW_MEASURE);

   p_after  = fftw_plan_dft_1d(d, (ltfat_complex*)sbuf, (ltfat_complex*)sbuf,
			       FFTW_BACKWARD, FFTW_MEASURE);

   /* Create plan. In-place. N*R*W contiguous length-M transforms over cout. */
   p_veryend = fftw_plan_many_dft(1, &M, N*R*W,
				  cout, NULL,
				  1, M,
				  cout, NULL,
				  1, M,
				  FFTW_FORWARD, FFTW_OPTITYPE);

   if (dotime)
   {
      st1=ltfat_time();
      printf("DGT_FAC_7: Planning phase %f\n",st1-st0);
   }

   /* Leading dimensions of the 4dim array. */
   /* ld2a: distance in doubles between consecutive s-slices of ff. */
   ld2a=2*p*q*W;

   /* Leading dimensions of cf */
   /* ld3b: distance in doubles between consecutive s-slices of cf.
    * ld1b is assigned but not read in this function. */
   ld1b=q*R;
   ld3b=2*q*R*q*W;

   /* --------- main loop begins here ------------------- */
   for (r=0;r<c;r++)
   {

      /* ---------- compute signal factorization ----------- */
      /* ffp walks through ff; fp points at complex element r of f and is
       * advanced by one block of L per w, then rewound. */
      ffp=ff;
      fp=(double*)f+2*r;
      if (p==1)

	 /* Integer oversampling case */
      {

	 for (w=0;w<W;w++)
	 {
	    for (l=0;l<q;l++)
	    {
	       /* Gather d samples into sbuf (index taken modulo L). */
	       for (s=0;s<d;s++)
	       {
		  rem = 2*((s*M+l*a)%L);
		  sbuf[2*s]   = fp[rem];
		  sbuf[2*s+1] = fp[rem+1];
	       }

	       fftw_execute(p_before);

	       /* Scatter the scaled length-d FFT into ff, one value per s-slice. */
	       for (s=0;s<d;s++)
	       {
		  ffp[s*ld2a]   = sbuf[2*s]*scalconst;
		  ffp[s*ld2a+1] = sbuf[2*s+1]*scalconst;
	       }
	       ffp+=2;
	    }
	    fp+=2*L;
	 }
	 fp-=2*L*W;

      }
      else
      {
	 /* rational sampling case */

	 for (w=0;w<W;w++)
	 {
	    for (l=0;l<q;l++)
	    {
	       for (k=0;k<p;k++)
	       {
		  /* Gather d samples; positiverem keeps the index in [0, L)
		   * (presumably a non-negative modulo -- TODO confirm). */
		  for (s=0;s<d;s++)
		  {
		     rem = 2*positiverem(k*M+s*p*M-l*h_a*a, L);
		     sbuf[2*s]   = fp[rem];
		     sbuf[2*s+1] = fp[rem+1];
		  }

		  fftw_execute(p_before);

		  for (s=0;s<d;s++)
		  {
		     ffp[s*ld2a]   = sbuf[2*s]*scalconst;
		     ffp[s*ld2a+1] = sbuf[2*s+1]*scalconst;
		  }
		  ffp+=2;
	       }
	    }
	    fp+=2*L;
	 }
	 fp-=2*L*W;
      }

      /* ----------- compute matrix multiplication ----------- */

      /* Do the matmul: every output entry is a sum of products
       * f * conj(g), written out on interleaved re/im doubles. */
      if (p==1)
      {
	 /* Integer oversampling case */

	 /* With p==1 each product has a single term, so no inner sum. */
	 for (s=0;s<d;s++)
	 {
	    gbase=(double*)gf+2*(r+s*c)*q*R;
	    fbase=ff+2*s*q*W;
	    cbase=cf+2*s*q*q*W*R;

	    for (nm=0;nm<q*W;nm++)
	    {
	       for (mm=0;mm<q*R;mm++)
	       {
		  cbase[0]=gbase[0]*fbase[0]+gbase[1]*fbase[1];
		  cbase[1]=gbase[0]*fbase[1]-gbase[1]*fbase[0];
		  gbase+=2;
		  cbase+=2;
	       }
	       /* Rewind the window pointer, advance to the next signal entry. */
	       gbase-=2*q*R;
	       fbase+=2;
	    }
	    /* cbase is re-assigned at the top of the next s iteration. */
	    cbase-=2*q*R*q*W;
	 }

      }
      else
      {

	 /* Rational oversampling case */
	 for (s=0;s<d;s++)
	 {
	    gbase=(double*)gf+2*(r+s*c)*p*q*R;
	    fbase=ff+2*s*p*q*W;
	    cbase=cf+2*s*q*q*W*R;

	    for (nm=0;nm<q*W;nm++)
	    {
	       for (mm=0;mm<q*R;mm++)
	       {
		  cbase[0]=0.0;
		  cbase[1]=0.0;
		  /* Inner product of length p. */
		  for (km=0;km<p;km++)
		  {
		     cbase[0]+=gbase[0]*fbase[0]+gbase[1]*fbase[1];
		     cbase[1]+=gbase[0]*fbase[1]-gbase[1]*fbase[0];
		     gbase+=2;
		     fbase+=2;
		  }
		  /* Reuse the same p signal entries for the next mm. */
		  fbase-=2*p;
		  cbase+=2;
	       }
	       gbase-=2*q*R*p;
	       fbase+=2*p;
	    }
	    /* Both pointers are re-assigned at the top of the next s iteration. */
	    cbase-=2*q*R*q*W;
	    fbase-=2*p*q*W;
	 }
      }

      /*  -------  compute inverse coefficient factorization ------- */
      /* ld4c / ld5c: distances in complex values between consecutive
       * rw-blocks and w-blocks of cout. */
      cfp=cf;
      ld4c=M*N;
      ld5c=M*N*R;

      /* Cover both integer and rational sampling case */
      for (w=0;w<W;w++)
      {
	 /* Complete inverse fac of coefficients */
	 for (l=0;l<q;l++)
	 {
	    for (rw=0;rw<R;rw++)
	    {
	       for (u=0;u<q;u++)
	       {
		  /* Gather one value per s-slice of cf into sbuf. */
		  for (s=0;s<d;s++)
		  {
		     sbuf[2*s]   = cfp[s*ld3b];
		     sbuf[2*s+1] = cfp[s*ld3b+1];
		  }
		  cfp+=2;

		  /* Do inverse fft of length d */
		  fftw_execute(p_after);

		  /* Scatter into cout; rem is a complex index. */
		  for (s=0;s<d;s++)
		  {
		     rem= r+l*c+positiverem(u+s*q-l*h_a,N)*M+rw*ld4c+w*ld5c;
		     cout[rem][0]=sbuf[2*s];
		     cout[rem][1]=sbuf[2*s+1];
		  }
	       }
	    }
	 }
      }

      /* ----------- Main loop ends here ------------------------ */
   }

   if (dotime)
   {
      st6=ltfat_time();
      printf("DGT_FAC_7: Main loop done %f\n",st6-st1);
   }

   /* FFT to modulate the coefficients. */
   fftw_execute(p_veryend);

   if (dotime)
   {
      st7=ltfat_time();
      printf("DGT_FAC_7: Final FFT %f\n",st7-st6);
      printf("DGT_FAC_7: Total time %f\n",st7-st0);
   }

   /* -----------  Clean up ----------------- */
   fftw_destroy_plan(p_before);
   fftw_destroy_plan(p_after);
   fftw_destroy_plan(p_veryend);

   ltfat_free(sbuf);
   ltfat_free(ff);
   ltfat_free(cf);
}
// This is an operator that applies (A - shift*B) on a vector using the FFT void SPB::BandSolver_Ez::OpForw(size_t n, const complex_t &shift, const complex_t *x, complex_t *y) const{ const int Ngrid = res[0]*res[1]; complex_t *t = (complex_t*)fftw_malloc(sizeof(complex_t)*n); // Data layout: Hx, Hy, Ez, divH fftw_plan plan_forward = fftw_plan_many_dft( 2/*rank*/, res, 4 /*howmany*/, (fftw_complex*)t, NULL/*inembed*/, 1/*istride*/, Ngrid/*idist*/, (fftw_complex*)t, NULL/*onembed*/, 1/*ostride*/, Ngrid/*odist*/, FFTW_BACKWARD, FFTW_ESTIMATE); fftw_plan plan_backward = fftw_plan_many_dft( 2/*rank*/, res, 4 /*howmany*/, (fftw_complex*)t, NULL/*inembed*/, 1/*istride*/, Ngrid/*idist*/, (fftw_complex*)t, NULL/*onembed*/, 1/*ostride*/, Ngrid/*odist*/, FFTW_FORWARD, FFTW_ESTIMATE); const double kshiftsign = 1.0; RNP::TBLAS::Copy(n, x,1, t,1); for(int i = 0; i < res[0]; ++i){ for(int j = 0; j < res[1]; ++j){ double phase = kshiftsign*2*M_PI*(last_k[0]*((double)i)/res[0] + last_k[1]*((double)j)/res[1]); t[HX_OFF + IDX(i,j)] *= complex_t(cos(phase), sin(phase)); t[HY_OFF + IDX(i,j)] *= complex_t(cos(phase), sin(phase)); t[EZ_OFF + IDX(i,j)] *= complex_t(cos(phase), sin(phase)); t[DIVH_OFF+IDX(i,j)] *= complex_t(cos(phase), sin(phase)); } } fftw_execute(plan_forward); for(int i = 0; i < res[0]; ++i){ const int fi = (i > res[0]/2 ? i-res[0] : i); for(int j = 0; j < res[1]; ++j){ const int fj = (j > res[1]/2 ? j-res[1] : j); double kpG[2] = { (L.Lk[0]*(last_k[0]+fi) + L.Lk[2]*(last_k[1]+fj)), (L.Lk[1]*(last_k[0]+fi) + L.Lk[3]*(last_k[1]+fj)) }; kpG[0] *= 2*M_PI; kpG[1] *= 2*M_PI; const double klen2 = kpG[0]*kpG[0] + kpG[1]*kpG[1]; const double klen = sqrt(klen2); if(klen < std::numeric_limits<double>::epsilon() * L.CharacteristicKLength()){ continue; } // [ -q mu k x k 0 0 ] [ H ] [ H ] // [ -k x -q eps 0 0 -i wp ] [ E ] = [ E ] // [ k. 
0 0 0 0 ] [dvH] = [dvH] // [ 0 0 0 -q eta i w0 ] [ P ] [ P ] // [ 0 i wp 0 -i w0 eta -q eta ] [ V ] [ V ] // given /* // Forward and backward differences #define FDIFF(VEC,D) ((std::exp(complex_t(0,-(VEC)[D]/res[D]))-1.) * (double)res[D]) #define BDIFF(VEC,D) ((1.-std::exp(complex_t(0,(VEC)[D]/res[D]))) * (double)res[D]) static const complex_t I(0.,1.); y[HX_OFF + IDX(i,j)] = -I*FDIFF(kpG,1)*t[EZ_OFF + IDX(i,j)] + I*BDIFF(kpG,0)*t[DIVH_OFF+IDX(i,j)]; y[HX_OFF + IDX(i,j)] /= ((double)Ngrid); y[HY_OFF + IDX(i,j)] = I*FDIFF(kpG,0)*t[EZ_OFF + IDX(i,j)] + I*BDIFF(kpG,1)*t[DIVH_OFF+IDX(i,j)]; y[HY_OFF + IDX(i,j)] /= ((double)Ngrid); y[EZ_OFF + IDX(i,j)] = -I*BDIFF(kpG,1)*t[HX_OFF + IDX(i,j)] + I*BDIFF(kpG,0)*t[HY_OFF + IDX(i,j)]; y[EZ_OFF + IDX(i,j)] /= ((double)Ngrid); y[DIVH_OFF+IDX(i,j)] = I*FDIFF(kpG,0)*t[HX_OFF + IDX(i,j)] + I*FDIFF(kpG,1)*t[HY_OFF + IDX(i,j)]; y[DIVH_OFF+IDX(i,j)] /= ((double)Ngrid); y[HX_OFF + IDX(i,j)] -= shift*t[HX_OFF + IDX(i,j)]/((double)Ngrid); y[HY_OFF + IDX(i,j)] -= shift*t[HY_OFF + IDX(i,j)]/((double)Ngrid); y[EZ_OFF + IDX(i,j)] -= shift*t[EZ_OFF + IDX(i,j)]/((double)Ngrid); */ static const complex_t I(0.,1.); const size_t n_res = 0; const size_t nh = 4+2*n_res; complex_t *A = new complex_t[nh*nh+2*nh]; complex_t *b = A+nh*nh; complex_t *c = b+nh; memset(A, 0, sizeof(complex_t)*nh*nh); A[0+2*nh] = -I*FDIFF(kpG,1); A[0+3*nh] = I*BDIFF(kpG,0); A[1+2*nh] = I*FDIFF(kpG,0); A[1+3*nh] = I*BDIFF(kpG,1); A[2+0*nh] = -I*BDIFF(kpG,1); A[2+1*nh] = I*BDIFF(kpG,0); A[3+0*nh] = I*FDIFF(kpG,0); A[3+1*nh] = I*FDIFF(kpG,1); A[0+0*nh] = -shift; A[1+1*nh] = -shift; A[2+2*nh] = -shift; b[0] = t[HX_OFF + IDX(i,j)]; b[1] = t[HY_OFF + IDX(i,j)]; b[2] = t[EZ_OFF + IDX(i,j)]; b[3] = t[DIVH_OFF+IDX(i,j)]; RNP::TBLAS::MultMV<'N'>(nh,nh, 1.,A,nh, b,1, 0.,c,1); y[HX_OFF + IDX(i,j)] = c[0] / ((double)Ngrid); y[HY_OFF + IDX(i,j)] = c[1] / ((double)Ngrid); y[EZ_OFF + IDX(i,j)] = c[2] / ((double)Ngrid); y[DIVH_OFF+IDX(i,j)] = c[3] / ((double)Ngrid); delete [] A; } } 
RNP::TBLAS::Copy(n, y,1, t,1); fftw_execute(plan_backward); fftw_destroy_plan(plan_forward); fftw_destroy_plan(plan_backward); RNP::TBLAS::Copy(n, t,1, y,1); for(int i = 0; i < res[0]; ++i){ for(int j = 0; j < res[1]; ++j){ double phase = -kshiftsign*2*M_PI*(last_k[0]*((double)i)/res[0] + last_k[1]*((double)j)/res[1]); y[HX_OFF + IDX(i,j)] *= complex_t(cos(phase), sin(phase)); y[HY_OFF + IDX(i,j)] *= complex_t(cos(phase), sin(phase)); y[EZ_OFF + IDX(i,j)] *= complex_t(cos(phase), sin(phase)); y[DIVH_OFF+IDX(i,j)] *= complex_t(cos(phase), sin(phase)); /* y[HX_OFF + IDX(i,j)] -= shift*x[HX_OFF + IDX(i,j)]; y[HY_OFF + IDX(i,j)] -= shift*x[HY_OFF + IDX(i,j)]; y[EZ_OFF + IDX(i,j)] -= shift*x[EZ_OFF + IDX(i,j)];*/ } } fftw_free(t); }
// Assembles the sparse matrices A and B for the k-point `k` (two components)
// and returns the result of solver->Solve().
//
// Unknown layout: four blocks of Ngrid = res[0]*res[1] entries addressed via
// HX_OFF, HY_OFF, EZ_OFF and DIVH_OFF, followed by two auxiliary unknowns
// (V_p, P_p) per material pole for every cell that lies inside a shape.
// When the structure changed since the last solve, the per-cell index table
// impl->ind and the array impl->eps_z_fft are rebuilt first.
int SPB::BandSolver_Ez::SolveK(const double *k){
	SPB_VERB(1, "Solving k-point (%.14g, %.14g)\n", k[0], k[1]);
	ClearSolution();

	// Remember the k-point on the object (other members read last_k).
	last_k[0] = k[0];
	last_k[1] = k[1];

	// Prepare the indexing
	const size_t Ngrid = res[0] * res[1];
	if(impl->structure_changed_since_last_solve){
		// impl->ind holds two ints per cell:
		//   [2*cell+0] first row of this cell's pole unknowns,
		//   [2*cell+1] material tag of the cell (-1 when outside every shape).
		free(impl->ind);
		impl->ind = (int*)malloc(sizeof(int) * 2*Ngrid);
		fftw_free(impl->eps_z_fft);
		impl->eps_z_fft = (complex_t*)fftw_malloc(sizeof(complex_t)*Ngrid);

		size_t next_index = 0;
		for(int i = 0; i < res[0]; ++i){
			// Fractional cell coordinate, shifted so the range is [-0.5, 0.5).
			const double fi = ((double)i/(double)res[0]) - 0.5;
			for(int j = 0; j < res[1]; ++j){
				const double fj = ((double)j/(double)res[1]) - 0.5;
				impl->ind[2*IDX(i,j)+0] = 4*Ngrid+next_index;

				// get material of this cell (simple pointwise check)
				int tag, num_poles;
				//if(2 == dim){
					// Real-space point: lattice vectors (L.Lr) times fractional coordinates.
					double p[2] = {
						L.Lr[0]*fi + L.Lr[2]*fj,
						L.Lr[1]*fi + L.Lr[3]*fj
					};
					if(!shapeset.QueryPt(p, &tag)){
						tag = -1;
					}
				/*}else{
					double p[3] = {
						L.Lr[0]*fi + L.Lr[3]*fj + L.Lr[6]*fk,
						L.Lr[1]*fi + L.Lr[4]*fj + L.Lr[7]*fk,
						L.Lr[2]*fi + L.Lr[5]*fj + L.Lr[8]*fk
					};
					if(ShapeSet3_query_pt(shapeset.d3, p, NULL, &tag)){
					}else{
						tag = -1;
					}
				}*/
				if(-1 == tag){
					// Outside every shape: no poles, unit permittivity.
					num_poles = 0;
					impl->eps_z_fft[IDX(i,j)] = 1.;
				}else{
					num_poles = material[tag].poles.size();
					impl->eps_z_fft[IDX(i,j)] = material[tag].eps_inf.value[8];
					// NOTE(review): prints one line per in-shape cell on every
					// rebuild; looks like leftover debug output -- confirm.
					std::cout << i << "\t" << j << "\t" << impl->eps_z_fft[IDX(i,j)] << "\t" << num_poles << std::endl;
				}
				impl->ind[2*IDX(i,j)+1] = tag;

				// update next index (two unknowns per pole)
				next_index += 2*num_poles;
			}
		}
		//impl->N = 4*Ngrid + 3*zero_constraint + next_index;
		// Total number of unknowns: four field blocks plus all pole unknowns.
		impl->N = 4*Ngrid + next_index;
		/*
		switch(pol){
		case 1:
			// Hx,Hy,Ez, divH
			N = (3+1)*Ngrid + 3*zero_constraint + next_index;
			break;
		case 2:
			// Hz,Ex,Ey (Hz is already div-free)
			N = (3+0)*Ngrid + 3*zero_constraint + next_index;
			break;
		default:
			// Hx,Hy,Hz,Ex,Ey,Ez, divH
			N = (6+1)*Ngrid + 6*zero_constraint + next_index;
			break;
		}*/

		// Transform the permittivity grid in place (single 2D transform).
		fftw_plan plan_eps = fftw_plan_many_dft(
			2/*rank*/, res, 1 /*howmany*/,
			(fftw_complex*)impl->eps_z_fft, NULL/*inembed*/,
			1/*istride*/, Ngrid/*idist*/,
			(fftw_complex*)impl->eps_z_fft, NULL/*onembed*/,
			1/*ostride*/, Ngrid/*odist*/,
			FFTW_BACKWARD, FFTW_ESTIMATE);
		fftw_execute(plan_eps);
		fftw_destroy_plan(plan_eps);

		impl->structure_changed_since_last_solve = false;
	}

	// Matrix entries are collected in maps keyed by (row, col); a later write
	// to the same key replaces the earlier one, so statement order matters.
	sparse_t::entry_map_t Amap;
	sparse_t::entry_map_t Bmap;
	{
		// Lengths of the two lattice vectors and inverse grid spacings.
		const double Lrl[2] = {
			hypot(L.Lr[0], L.Lr[1]),
			hypot(L.Lr[2], L.Lr[3])
		};
		const double idr[2] = {
			(double)res[0] / Lrl[0],
			(double)res[1] / Lrl[1]
		};
		// Phase factors exp(i*2*pi*k) applied where a stencil wraps around
		// the grid boundary.
		const complex_t Bloch[2] = {
			complex_t(cos(k[0]*2*M_PI), sin(k[0]*2*M_PI)),
			complex_t(cos(k[1]*2*M_PI), sin(k[1]*2*M_PI))
		};
		for(int i = 0; i < res[0]; ++i){
			for(int j = 0; j < res[1]; ++j){
				size_t row, col;
				complex_t coeff;

				const int curmat = impl->ind[2*IDX(i,j)+1];

				complex_t eps_z(1.);
				if(curmat >= 0){
					eps_z = material[curmat].eps_inf.value[8];
				}

// Store one entry of A / B (overwrites any existing entry at that position).
#define ASET(ROW,COL,COEFF) Amap[sparse_t::index_t((ROW),(COL))] = (COEFF)
#define BSET(ROW,COL,COEFF) Bmap[sparse_t::index_t((ROW),(COL))] = (COEFF)

				// divH ~ dx Hx + dy Hy + dz Hz
				// E ~ -i wp V
				// V ~ +i wp E - i G V - i w0 P
				// P ~ +i w0 V

				//for(size_t idbg=0;idbg<ne+nh+1;++idbg){
					//ASET(row0+idbg,row0+idbg,1); // for debugging
				//}
				// Hx ~ -i dy Ez
				// Hy ~ +i dx Ez
				// Ez ~ -i dy Hx + i dx Hy

				// Hx = complex_t(0,-idr[1]) * (Ez[i,j+1,k] - Ez[i,j,k])
				row = HX_OFF + IDX(i,j);
				coeff = complex_t(0,-idr[1]);
				col = EZ_OFF + IDX(i,j); // Ez
				ASET(row,col, -coeff);
				if(j+1 == res[1]){
					// Forward neighbour wraps to j = 0.
					col = EZ_OFF + IDX(i,0); // Ez
					ASET(row,col, coeff/Bloch[1]);
				}else{
					col = EZ_OFF + IDX(i,j+1); // Ez
					ASET(row,col, coeff);
				}
				BSET(row,row, 1);

				// Hy = complex_t(0, idr[0]) * (Ez[i+1,j,k] - Ez[i,j,k])
				row = HY_OFF + IDX(i,j);
				coeff = complex_t(0, idr[0]);
				col = EZ_OFF + IDX(i,j); // Ez
				ASET(row,col, -coeff);
				if(i+1 == res[0]){
					// Forward neighbour wraps to i = 0.
					col = EZ_OFF + IDX(0,j); // Ez
					ASET(row,col, coeff/Bloch[0]);
				}else{
					col = EZ_OFF + IDX(i+1,j); // Ez
					ASET(row,col, coeff);
				}
				BSET(row,row, 1);

				// divH = idr[0] * (Hx[i+1,j,k] - Hx[i,j,k])
				//      + idr[1] * (Hy[i,j+1,k] - Hx[i,j,k])
				// Each entry is mirrored as its conjugate at (col,row).
				row = DIVH_OFF + IDX(i,j);
				coeff = complex_t(0,idr[0]);
				col = HX_OFF + IDX(i,j); // Hx
				ASET(row,col, -coeff);
				ASET(col,row, -std::conj(coeff));
				if(i+1 == res[0]){
					col = HX_OFF + IDX(0,j); // Hx
					ASET(row,col, coeff/Bloch[0]);
					ASET(col,row, std::conj(coeff/Bloch[0]));
				}else{
					col = HX_OFF + IDX(i+1,j); // Hx
					ASET(row,col, coeff);
					ASET(col,row, std::conj(coeff));
				}

				coeff = complex_t(0,idr[1]);
				col = HY_OFF + IDX(i,j); // Hy
				ASET(row,col, -coeff);
				ASET(col,row, -std::conj(coeff));
				if(j+1 == res[1]){
					col = HY_OFF + IDX(i,0); // Hy
					ASET(row,col, coeff/Bloch[1]);
					ASET(col,row, std::conj(coeff/Bloch[1]));
				}else{
					col = HY_OFF + IDX(i,j+1); // Hy
					ASET(row,col, coeff);
					ASET(col,row, std::conj(coeff));
				}
				// The divH rows get an explicit zero on the diagonal of B.
				BSET(row,row, 0);

				// Ez = complex_t(0,-idr[1]) * (Hx[i,j,k] - Hx[i,j-1,k])
				//    + complex_t(0, idr[0]) * (Hy[i,j,k] - Hy[i-1,j,k])
				row = EZ_OFF + IDX(i,j);
				coeff = complex_t(0,-idr[1]);
				col = HX_OFF + IDX(i,j); // Hx
				ASET(row,col, coeff);
				if(0 == j){
					// Backward neighbour wraps to j = res[1]-1.
					col = HX_OFF + IDX(i,res[1]-1); // Hx
					ASET(row,col, -coeff*Bloch[1]);
				}else{
					col = HX_OFF + IDX(i,j-1); // Hx
					ASET(row,col, -coeff);
				}

				coeff = complex_t(0, idr[0]);
				col = HY_OFF + IDX(i,j); // Hy
				ASET(row,col, coeff);
				if(0 == i){
					// Backward neighbour wraps to i = res[0]-1.
					col = HY_OFF + IDX(res[0]-1,j); // Hy
					ASET(row,col, -coeff*Bloch[0]);
				}else{
					col = HY_OFF + IDX(i-1,j); // Hy
					ASET(row,col, -coeff);
				}
				BSET(row,row, eps_z);

				// Pole unknowns of the cell's material: rows row0+2p (V_p)
				// and row0+2p+1 (P_p).
				if(curmat >= 0){
					const int row0 = impl->ind[2*IDX(i,j)+0];
					const Material &m = material[curmat];
					const size_t np = m.poles.size();
					for(size_t p = 0; p < np; ++p){
						row = row0 + 2*p + 0; // V_p
						coeff = complex_t(0, m.poles[p].omega_p) * eps_z;
						col = EZ_OFF + IDX(i,j); // E
						ASET(row,col, coeff);
						ASET(col,row, std::conj(coeff));

						// Diagonal damping entry, only stored when Gamma is non-zero.
						if(0 != m.poles[p].Gamma){
							coeff = complex_t(0,-m.poles[p].Gamma) * eps_z;
							ASET(row,row, coeff);
						}
						BSET(row,row, 1);

						coeff = complex_t(0, -m.poles[p].omega_0) * eps_z;
						col = row0 + 2*p + 1; // P
						ASET(row,col, coeff);
						ASET(col,row, std::conj(coeff));
						BSET(col,col, 1);
					}
				}
				/*
				}else if(2 == pol){
					// Hz ~ +i dy Ex - i dx Ey
					// Ex ~ +i dy Hz
					// Ey ~ -i dx Hz
				}else{
					// Hx ~ +i dz Ey - i dy Ez
					// Hy ~ -i dz Ex + i dx Ez
					// Hz ~ +i dy Ex - i dx Ey
					// Ex ~ -i dz Hy + i dy Hz
					// Ey ~ +i dz Hx - i dx Hz
					// Ez ~ -i dy Hx + i dx Hy
				}*/
			}
		}
	}
	// Build the N x N sparse matrices from the collected entries.
	impl->A = new sparse_t(impl->N,impl->N, Amap);
	impl->B = new sparse_t(impl->N,impl->N, Bmap);

	// Disabled debug dump of both matrices.
	if(0){
		std::cout << "A="; RNP::Sparse::PrintSparseMatrix(*(impl->A)) << ";" << std::endl;
		std::cout << "B="; RNP::Sparse::PrintSparseMatrix(*(impl->B)) << ";" << std::endl;
		exit(0);
	}
	/*
	complex_t *tmp = new complex_t[4*Ngrid];
	complex_t *tmp2 = new complex_t[16*Ngrid*Ngrid];
	for(size_t i = 0; i < res[0]; ++i){
		for(size_t j = 0; j < res[1]; ++j){
			tmp[IDX(i,j)] = 0;
		}
	}
	for(size_t i = 0; i < res[0]; ++i){
		for(size_t j = 0; j < res[1]; ++j){
			tmp[IDX(i,j)] = 1;
			Precond(tmp, &tmp2[0+IDX(i,j)*Ngrid]);
			tmp[IDX(i,j)] = 0;
		}
	}
	delete [] tmp2;
	delete [] tmp;
	*/
	return solver->Solve();
	// Everything below is disabled experimental code kept for reference.
	/*
	{
		const size_t n = 4*Ngrid;
		complex_t *x = (complex_t*)fftw_malloc(sizeof(complex_t)*n);
		complex_t *y = (complex_t*)fftw_malloc(sizeof(complex_t)*n);
		complex_t *z = (complex_t*)fftw_malloc(sizeof(complex_t)*n);
		const double theta = 0.6;
		memset(x, 0, sizeof(complex_t)*n);
		for(int i = 0; i < n; ++i){
			x[i] = frand();
		}
		std::cout << "x = "; RNP::IO::PrintVector(n, x, 1) << std::endl;
		Aop(x, y);
		Bop(x, z);
		RNP::TBLAS::Axpy(n, -theta, z,1, y,1);
		// At this point y = A*x-theta*B*x
		std::cout << "y = "; RNP::IO::PrintVector(n, y, 1) << std::endl;
		Op(n, theta, y, z);
		// At this point z should be the same as x
		std::cout << "z = "; RNP::IO::PrintVector(n, z, 1) << std::endl;
		RNP::TBLAS::Axpy(n, -1., x,1,z,1);
		std::cout << "diff = "; RNP::IO::PrintVector(n, z, 1) << std::endl;
		fftw_free(z);
		fftw_free(y);
		fftw_free(x);
	}*/
	/*
	size_t n_wanted = 10;
	size_t ncv = 2*n_wanted+1;
	SPB::complex_t *w = new SPB::complex_t[n_wanted+ncv*4*Ngrid];
	SPB::complex_t *v = w+n_wanted;
	int nconv = RNP::IRA::ShiftInvert(
		4*Ngrid, 0.0, &op_, &bv_,
		n_wanted, ncv, &RNP::IRA::LargestMagnitude,
		w, v, 4*Ngrid,
		NULL,
		NULL,
		(void*)this,
		(void*)this);
	for(size_t i = 0; i < n_wanted;++i){
		std::cout << w[i] << std::endl;
	}
	*/
}
bool internal_fftw(Mda &X, integer dim, integer do_inverse_transform) { QTime time; time.start(); int d1=(int)dim-1; qint64 N=X.size(); if (!N) return false; if ((d1>=X.dimCount())||(d1<0)) { qWarning() << "Dimension out of range"; return false; } fftw_complex *in; fftw_plan p; in = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * N); //out = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * N); int n[MAX_MDA_DIMS]; qint32 j,k; n[0]=X.size(d1); int istride=1; int idist=X.size(d1); int howmany=N/idist; if (do_inverse_transform) { p = fftw_plan_many_dft(1, n, howmany, in, NULL,istride, idist, in, NULL,istride, idist, FFTW_BACKWARD, FFTW_ESTIMATE); } else { p = fftw_plan_many_dft(1, n, howmany, in, NULL,istride, idist, in, NULL,istride, idist, FFTW_FORWARD, FFTW_ESTIMATE); } long factor1=1; long factor2=1; for (int dimind=0; dimind<d1; dimind++) { factor1*=X.size(dimind); } for (int dimind=d1; dimind<X.dimCount(); dimind++) { factor2*=X.size(dimind); } qint32 ind[MAX_MDA_DIMS]; bool done; if (X.data_real) { real *D=X.data_real; long ct=0; for (long j1=0; j1<factor1; j1++) { for (long j2=0; j2<factor2; j2++) { in[ct][0]=D[j1+factor1*j2]; in[ct][1]=0; ct++; }} } else if (X.data_complex) { complex_struct *D=X.data_complex; long ct=0; for (long j1=0; j1<factor1; j1++) { for (long j2=0; j2<factor2; j2++) { in[ct][0]=D[j1+factor1*j2].re; in[ct][1]=D[j1+factor1*j2].im; ct++; }} } else { long ct=0; for (long j1=0; j1<factor1; j1++) { for (long j2=0; j2<factor2; j2++) { in[ct][0]=X[j1+factor1*j2].re(); in[ct][1]=X[j1+factor1*j2].im(); ct++; }} } //printf("fftw_execute..."); fftw_execute(p); //printf("fftw_execute finished."); if (X.dataType()!=MDA_TYPE_COMPLEX) { //printf("Converting to complex..."); X.convertToComplex(); //printf("Done."); } if (X.data_complex) { complex_struct *D=X.data_complex; long ct=0; for (long j2=0; j2<factor2; j2++) { for (long j1=0; j1<factor1; j1++) { D[ct].re=in[j2+factor2*j1][0]; D[ct].im=in[j2+factor2*j1][1]; ct++; }} } else { long ct=0; for 
(long j2=0; j2<factor2; j2++) { for (long j1=0; j1<factor1; j1++) { X[ct]=Complex(in[j2+factor2*j1][0],in[j2+factor2*j1][1]); ct++; }} } fftw_destroy_plan(p); fftw_free(in); if (do_inverse_transform) { real factor=1.0F/X.size(d1); if (X.data_complex) { complex_struct *D=X.data_complex; for (j=0; j<X.size(); j++) { D[j].re*=factor; D[j].im*=factor; } } else { for (qint64 j=0; j<X.size(); j++) X[j]=X[j]*factor; } } return true; }
/*
 * Create a plan for a distributed 3d FFT.
 *
 *   comm                : MPI communicator of the participating processes
 *   nfast, nmid, nslow  : global size of the 3d grid along each axis
 *   in_ilo .. in_khi    : inclusive index bounds of the input owned by this
 *                         process (i = fast, j = mid, k = slow)
 *   out_ilo .. out_khi  : inclusive index bounds of the output owned by this
 *                         process
 *   scaled              : 0 = no scaling, otherwise store the normalization
 *                         1/(nfast*nmid*nslow) in the plan
 *   permute             : storage permutation of the output, passed on to the
 *                         remap plans
 *   nbuf                : (output) sum of the copy and scratch buffer sizes
 *
 * Returns the new plan, or NULL when an allocation or a remap plan creation
 * fails. On failure the plan struct and its buffers are released.
 * NOTE(review): remap plans created before a failure are not destroyed here
 * because their destroy routine is not visible from this function.
 */
struct fft_plan_3d *fft_3d_create_plan(
       MPI_Comm comm, int nfast, int nmid, int nslow,
       int in_ilo, int in_ihi, int in_jlo, int in_jhi,
       int in_klo, int in_khi,
       int out_ilo, int out_ihi, int out_jlo, int out_jhi,
       int out_klo, int out_khi,
       int scaled, int permute, int *nbuf)
{
  struct fft_plan_3d *plan;
  int me,nprocs;
  int flag,remapflag;
  int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi;
  int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi;
  int third_ilo,third_ihi,third_jlo,third_jhi,third_klo,third_khi;
  int out_size,first_size,second_size,third_size,copy_size,scratch_size;
  int np1,np2,ip1,ip2;

  /* query MPI info */

  MPI_Comm_rank(comm,&me);
  MPI_Comm_size(comm,&nprocs);

  /* compute division of procs in 2 dimensions not on-processor */

  bifactor(nprocs,&np1,&np2);
  ip1 = me % np1;
  ip2 = me/np1;

  /* allocate memory for plan data struct */

  plan = (struct fft_plan_3d *) malloc(sizeof(struct fft_plan_3d));
  if (plan == NULL) return NULL;

  /* remap from initial distribution to layout needed for 1st set of 1d FFTs
     not needed if all procs own entire fast axis initially
     first indices = distribution after 1st set of FFTs */

  if (in_ilo == 0 && in_ihi == nfast-1)
    flag = 0;
  else
    flag = 1;

  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

  if (remapflag == 0) {
    first_ilo = in_ilo;
    first_ihi = in_ihi;
    first_jlo = in_jlo;
    first_jhi = in_jhi;
    first_klo = in_klo;
    first_khi = in_khi;
    plan->pre_plan = NULL;
  }
  else {
    first_ilo = 0;
    first_ihi = nfast - 1;
    first_jlo = ip1*nmid/np1;
    first_jhi = (ip1+1)*nmid/np1 - 1;
    first_klo = ip2*nslow/np2;
    first_khi = (ip2+1)*nslow/np2 - 1;
    plan->pre_plan =
      remap_3d_create_plan(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                           first_ilo,first_ihi,first_jlo,first_jhi,
                           first_klo,first_khi,
                           FFT_PRECISION,0,0,2);
    if (plan->pre_plan == NULL) {
      free(plan);
      return NULL;
    }
  }

  /* 1d FFTs along fast axis */

  plan->length1 = nfast;
  plan->total1 = nfast * (first_jhi-first_jlo+1) * (first_khi-first_klo+1);

  /* remap from 1st to 2nd FFT
     choose which axis is split over np1 vs np2 to minimize communication
     second indices = distribution after 2nd set of FFTs */

  second_ilo = ip1*nfast/np1;
  second_ihi = (ip1+1)*nfast/np1 - 1;
  second_jlo = 0;
  second_jhi = nmid - 1;
  second_klo = ip2*nslow/np2;
  second_khi = (ip2+1)*nslow/np2 - 1;
  plan->mid1_plan =
      remap_3d_create_plan(comm,
                           first_ilo,first_ihi,first_jlo,first_jhi,
                           first_klo,first_khi,
                           second_ilo,second_ihi,second_jlo,second_jhi,
                           second_klo,second_khi,
                           FFT_PRECISION,1,0,2);
  if (plan->mid1_plan == NULL) {
    free(plan);
    return NULL;
  }

  /* 1d FFTs along mid axis */

  plan->length2 = nmid;
  plan->total2 = (second_ihi-second_ilo+1) * nmid * (second_khi-second_klo+1);

  /* remap from 2nd to 3rd FFT
     if final distribution is permute=2 with all procs owning entire slow axis
       then this remapping goes directly to final distribution
     third indices = distribution after 3rd set of FFTs */

  if (permute == 2 && out_klo == 0 && out_khi == nslow-1)
    flag = 0;
  else
    flag = 1;

  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

  if (remapflag == 0) {
    third_ilo = out_ilo;
    third_ihi = out_ihi;
    third_jlo = out_jlo;
    third_jhi = out_jhi;
    third_klo = out_klo;
    third_khi = out_khi;
  }
  else {
    third_ilo = ip1*nfast/np1;
    third_ihi = (ip1+1)*nfast/np1 - 1;
    third_jlo = ip2*nmid/np2;
    third_jhi = (ip2+1)*nmid/np2 - 1;
    third_klo = 0;
    third_khi = nslow - 1;
  }

  plan->mid2_plan =
    remap_3d_create_plan(comm,
                         second_jlo,second_jhi,second_klo,second_khi,
                         second_ilo,second_ihi,
                         third_jlo,third_jhi,third_klo,third_khi,
                         third_ilo,third_ihi,
                         FFT_PRECISION,1,0,2);
  if (plan->mid2_plan == NULL) {
    free(plan);
    return NULL;
  }

  /* 1d FFTs along slow axis */

  plan->length3 = nslow;
  plan->total3 = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) * nslow;

  /* remap from 3rd FFT to final distribution
     not needed if permute = 2 and third indices = out indices on all procs */

  if (permute == 2 &&
      out_ilo == third_ilo && out_ihi == third_ihi &&
      out_jlo == third_jlo && out_jhi == third_jhi &&
      out_klo == third_klo && out_khi == third_khi)
    flag = 0;
  else
    flag = 1;

  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

  if (remapflag == 0)
    plan->post_plan = NULL;
  else {
    plan->post_plan =
      remap_3d_create_plan(comm,
                           third_klo,third_khi,third_ilo,third_ihi,
                           third_jlo,third_jhi,
                           out_klo,out_khi,out_ilo,out_ihi,
                           out_jlo,out_jhi,
                           FFT_PRECISION,(permute+1)%3,0,2);
    if (plan->post_plan == NULL) {
      free(plan);
      return NULL;
    }
  }

  /* configure plan memory pointers and allocate work space
     out_size = amount of memory given to FFT by user
     first/second/third_size = amount of memory needed after pre,mid1,mid2 remaps
     copy_size = amount needed internally for extra copy of data
     scratch_size = amount needed internally for remap scratch space
     for each remap:
       use out space for result if big enough, else require copy buffer
       accumulate largest required remap scratch space */

  out_size = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1);
  first_size = (first_ihi-first_ilo+1) * (first_jhi-first_jlo+1) *
    (first_khi-first_klo+1);
  second_size = (second_ihi-second_ilo+1) * (second_jhi-second_jlo+1) *
    (second_khi-second_klo+1);
  third_size = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) *
    (third_khi-third_klo+1);

  copy_size = 0;
  scratch_size = 0;

  if (plan->pre_plan) {
    if (first_size <= out_size)
      plan->pre_target = 0;
    else {
      plan->pre_target = 1;
      copy_size = MAX(copy_size,first_size);
    }
    scratch_size = MAX(scratch_size,first_size);
  }

  if (plan->mid1_plan) {
    if (second_size <= out_size)
      plan->mid1_target = 0;
    else {
      plan->mid1_target = 1;
      copy_size = MAX(copy_size,second_size);
    }
    scratch_size = MAX(scratch_size,second_size);
  }

  if (plan->mid2_plan) {
    if (third_size <= out_size)
      plan->mid2_target = 0;
    else {
      plan->mid2_target = 1;
      copy_size = MAX(copy_size,third_size);
    }
    scratch_size = MAX(scratch_size,third_size);
  }

  if (plan->post_plan)
    scratch_size = MAX(scratch_size,out_size);

  *nbuf = copy_size + scratch_size;

  if (copy_size) {
    plan->copy = (FFT_DATA *) malloc(copy_size*sizeof(FFT_DATA));
    if (plan->copy == NULL) {
      free(plan);
      return NULL;
    }
  }
  else plan->copy = NULL;

  if (scratch_size) {
    plan->scratch = (FFT_DATA *) malloc(scratch_size*sizeof(FFT_DATA));
    if (plan->scratch == NULL) {
      free(plan->copy);
      free(plan);
      return NULL;
    }
  }
  else plan->scratch = NULL;

  /* system specific pre-computation of 1d FFT coeffs
     and scaling normalization */

  plan->plan_fast_forward =
    fftw_plan_many_dft(1,&(plan->length1),plan->total1/plan->length1,
                       plan->scratch,NULL,1,plan->length1,plan->scratch,
                       NULL,1,plan->length1,FFTW_FORWARD,FFTW_ESTIMATE);
  plan->plan_fast_backward =
    fftw_plan_many_dft(1,&(plan->length1),plan->total1/plan->length1,
                       plan->scratch,NULL,1,plan->length1,plan->scratch,
                       NULL,1,plan->length1,FFTW_BACKWARD,FFTW_ESTIMATE);

  /* A plan made by fftw_plan_many_dft fixes the number of transforms as well
     as their length, so an earlier plan is shared only when the length and
     the total element count (hence the transform count) both match.
     NOTE(review): assumes the plan's destroy routine compares plan pointers
     to detect sharing; if it compares lengths instead, the additional plans
     created here would not be released -- confirm against that routine. */

  if (plan->length2 == plan->length1 && plan->total2 == plan->total1) {
    plan->plan_mid_forward = plan->plan_fast_forward;
    plan->plan_mid_backward = plan->plan_fast_backward;
  }
  else {
    plan->plan_mid_forward =
      fftw_plan_many_dft(1,&(plan->length2),plan->total2/plan->length2,
                         plan->scratch,NULL,1,plan->length2,plan->scratch,
                         NULL,1,plan->length2,FFTW_FORWARD,FFTW_ESTIMATE);
    plan->plan_mid_backward =
      fftw_plan_many_dft(1,&(plan->length2),plan->total2/plan->length2,
                         plan->scratch,NULL,1,plan->length2,plan->scratch,
                         NULL,1,plan->length2,FFTW_BACKWARD,FFTW_ESTIMATE);
  }

  if (plan->length3 == plan->length1 && plan->total3 == plan->total1) {
    plan->plan_slow_forward = plan->plan_fast_forward;
    plan->plan_slow_backward = plan->plan_fast_backward;
  }
  else if (plan->length3 == plan->length2 && plan->total3 == plan->total2) {
    plan->plan_slow_forward = plan->plan_mid_forward;
    plan->plan_slow_backward = plan->plan_mid_backward;
  }
  else {
    plan->plan_slow_forward =
      fftw_plan_many_dft(1,&(plan->length3),plan->total3/plan->length3,
                         plan->scratch,NULL,1,plan->length3,plan->scratch,
                         NULL,1,plan->length3,FFTW_FORWARD,FFTW_ESTIMATE);
    plan->plan_slow_backward =
      fftw_plan_many_dft(1,&(plan->length3),plan->total3/plan->length3,
                         plan->scratch,NULL,1,plan->length3,plan->scratch,
                         NULL,1,plan->length3,FFTW_BACKWARD,FFTW_ESTIMATE);
  }

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/(nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

  return plan;
}
bool do_fft_1d_r2c(int M, int N, float* out, float* in) { /* if (num_threads>1) { fftw_init_threads(); fftw_plan_with_nthreads(num_threads); } */ int MN = M * N; fftw_complex* in2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex) * MN); fftw_complex* out2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex) * MN); for (int ii = 0; ii < MN; ii++) { //in2[ii][0]=in[ii*2]; //in2[ii][1]=in[ii*2+1]; in2[ii][0] = in[ii]; in2[ii][1] = 0; } /* * From FFTW docs: * howmany is the number of transforms to compute. * The resulting plan computes howmany transforms, * where the input of the k-th transform is at * location in+k*idist (in C pointer arithmetic), * and its output is at location out+k*odist. * Plans obtained in this way can often be faster * than calling FFTW multiple times for the individual * transforms. The basic fftw_plan_dft interface corresponds * to howmany=1 (in which case the dist parameters are ignored). * * Each of the howmany transforms has rank rank * and size n, as in the basic interface. * In addition, the advanced interface allows the * input and output arrays of each transform to be * row-major subarrays of larger rank-rank arrays, * described by inembed and onembed parameters, * respectively. {i,o}nembed must be arrays of length * rank, and n should be elementwise less than or equal * to {i,o}nembed. Passing NULL for an nembed parameter * is equivalent to passing n (i.e. same physical and * logical dimensions, as in the basic interface.) * * The stride parameters indicate that the j-th element * of the input or output arrays is located at j*istride * or j*ostride, respectively. (For a multi-dimensional array, * j is the ordinary row-major index.) When combined with * the k-th transform in a howmany loop, from above, this * means that the (j,k)-th element is at j*stride+k*dist. * (The basic fftw_plan_dft interface corresponds to a stride * of 1.) 
*/ fftw_plan p; int rank = 1; int n[] = { N }; int howmany = M; int* inembed = n; int istride = M; int idist = 1; int* onembed = n; int ostride = M; int odist = 1; int sign = FFTW_FORWARD; unsigned flags = FFTW_ESTIMATE; #pragma omp critical p = fftw_plan_many_dft(rank, n, howmany, in2, inembed, istride, idist, out2, onembed, ostride, odist, sign, flags); //p=fftw_plan_dft_1d(N,in2,out2,FFTW_FORWARD,FFTW_ESTIMATE); fftw_execute(p); for (int ii = 0; ii < MN; ii++) { out[ii * 2] = out2[ii][0]; out[ii * 2 + 1] = out2[ii][1]; } fftw_free(in2); fftw_free(out2); /* if (num_threads>1) { fftw_cleanup_threads(); } */ #pragma omp critical fftw_destroy_plan(p); return true; }
void SPB::BandSolver_Ez::ShiftInv(const complex_t &shift, const complex_t *x, complex_t *y) const{ const int Ngrid = res[0]*res[1]; size_t n = 4*Ngrid; RNP::TBLAS::Copy(n, x,1, y,1); // Invert V and P fields // For zero shift, the main E/H field block matrix inverse is unchanged // v = inv(D) h // u = inv(C - W^H inv(D) W) (g - W^H v) for(int i = 0; i < res[0]; ++i){ for(int j = 0; j < res[1]; ++j){ const int tag = impl->ind[2*IDX(i,j)+1]; if(tag < 0){ continue; } const int row0 = impl->ind[2*IDX(i,j)+0]; const Material &mat = material[tag]; const int np = mat.poles.size(); for(int p = 0; p < np; ++p){ const LorentzPole &pole = mat.poles[p]; const complex_t iwp = complex_t(0,pole.omega_p) * mat.eps_inf.value[8]; const complex_t i_w0 = complex_t(0,1./pole.omega_0) / mat.eps_inf.value[8]; const complex_t iG_w0w0 = i_w0 * pole.Gamma / pole.omega_0; y[row0 + 2*p + 0] = -i_w0 * x[row0 + 2*p + 1]; y[row0 + 2*p + 1] = i_w0 * x[row0 + 2*p + 0] + iG_w0w0 * x[row0 + 2*p + 1]; y[EZ_OFF + IDX(i,j)] += iwp * y[row0 + 2*p + 0]; } } } // Data layout: divH, Ez, Hx, Hy fftw_plan plan_forward = fftw_plan_many_dft( 2/*rank*/, res, 4 /*howmany*/, (fftw_complex*)y, NULL/*inembed*/, 1/*istride*/, Ngrid/*idist*/, (fftw_complex*)y, NULL/*onembed*/, 1/*ostride*/, Ngrid/*odist*/, FFTW_BACKWARD, FFTW_ESTIMATE); fftw_plan plan_backward = fftw_plan_many_dft( 2/*rank*/, res, 4 /*howmany*/, (fftw_complex*)y, NULL/*inembed*/, 1/*istride*/, Ngrid/*idist*/, (fftw_complex*)y, NULL/*onembed*/, 1/*ostride*/, Ngrid/*odist*/, FFTW_FORWARD, FFTW_ESTIMATE); const double kshiftsign = 1.0; for(int i = 0; i < res[0]; ++i){ for(int j = 0; j < res[1]; ++j){ double phase = kshiftsign*2*M_PI*(last_k[0]*(double)i/res[0] + last_k[1]*(double)j/res[1]); y[HX_OFF + IDX(i,j)] *= complex_t(cos(phase), sin(phase)); y[HY_OFF + IDX(i,j)] *= complex_t(cos(phase), sin(phase)); y[EZ_OFF + IDX(i,j)] *= complex_t(cos(phase), sin(phase)); y[DIVH_OFF+IDX(i,j)] *= complex_t(cos(phase), sin(phase)); } } fftw_execute(plan_forward); 
complex_t A[4*4], b[4]; size_t ipiv[4]; for(int i = 0; i < res[0]; ++i){ const int fi = (i > res[0]/2 ? i-res[0] : i); for(int j = 0; j < res[1]; ++j){ const int fj = (j > res[1]/2 ? j-res[1] : j); double kpG[2] = { (L.Lk[0]*(last_k[0]+fi) + L.Lk[2]*(last_k[1]+fj)), (L.Lk[1]*(last_k[0]+fi) + L.Lk[3]*(last_k[1]+fj)) }; kpG[0] *= 2*M_PI; kpG[1] *= 2*M_PI; const double klen2 = kpG[0]*kpG[0] + kpG[1]*kpG[1]; const double klen = sqrt(klen2); // At the Gamma point, project out the constant basis vector if(klen < std::numeric_limits<double>::epsilon() * L.CharacteristicKLength()){ y[DIVH_OFF+IDX(i,j)] = 0; y[EZ_OFF + IDX(i,j)] = 0; y[HX_OFF + IDX(i,j)] = 0; y[HY_OFF + IDX(i,j)] = 0; continue; } // [ 0 0 k. 0 0 ] [dvH] = [dvH] // [ 0 -q eps -k x 0 -i wp ] [ E ] = [ E ] // [ k k x -q mu 0 0 ] [ H ] [ H ] // [ 0 0 0 -q eta i w0 ] [ P ] [ P ] // [ 0 i wp 0 -i w0 eta -q eta ] [ V ] [ V ] // given memset(A, 0, sizeof(complex_t)*4*4); // Forward and backward differences #define FDIFF(VEC,D) ((std::exp(complex_t(0,-(VEC)[D]/res[D]))-1.) 
* (double)res[D]) #define BDIFF(VEC,D) ((1.-std::exp(complex_t(0,(VEC)[D]/res[D]))) * (double)res[D]) static const complex_t I(0.,1.); A[2+1*4] = -I*FDIFF(kpG,1); A[2+0*4] = I*BDIFF(kpG,0); A[3+1*4] = I*FDIFF(kpG,0); A[3+0*4] = I*BDIFF(kpG,1); A[1+2*4] = -I*BDIFF(kpG,1); A[1+3*4] = I*BDIFF(kpG,0); A[0+2*4] = I*FDIFF(kpG,0); A[0+3*4] = I*FDIFF(kpG,1); b[0] = y[DIVH_OFF+IDX(i,j)]; b[1] = y[EZ_OFF + IDX(i,j)]; b[2] = y[HX_OFF + IDX(i,j)]; b[3] = y[HY_OFF + IDX(i,j)]; RNP::LinearSolve<'N'>(4,1, A,4, b,4); y[DIVH_OFF+IDX(i,j)] = b[0] / ((double)Ngrid); y[EZ_OFF + IDX(i,j)] = b[1] / ((double)Ngrid); y[HX_OFF + IDX(i,j)] = b[2] / ((double)Ngrid); y[HY_OFF + IDX(i,j)] = b[3] / ((double)Ngrid); } } fftw_execute(plan_backward); fftw_destroy_plan(plan_forward); fftw_destroy_plan(plan_backward); for(int i = 0; i < res[0]; ++i){ for(int j = 0; j < res[1]; ++j){ double phase = -kshiftsign*2*M_PI*(last_k[0]*(double)i/res[0] + last_k[1]*(double)j/res[1]); y[DIVH_OFF+IDX(i,j)] *= complex_t(cos(phase), sin(phase)); y[EZ_OFF + IDX(i,j)] *= complex_t(cos(phase), sin(phase)); y[HX_OFF + IDX(i,j)] *= complex_t(cos(phase), sin(phase)); y[HY_OFF + IDX(i,j)] *= complex_t(cos(phase), sin(phase)); } } // v += inv(D) W u for(int i = 0; i < res[0]; ++i){ for(int j = 0; j < res[1]; ++j){ const int tag = impl->ind[2*IDX(i,j)+1]; if(tag < 0){ continue; } const int row0 = impl->ind[2*IDX(i,j)+0]; const Material &mat = material[tag]; const int np = mat.poles.size(); for(int p = 0; p < np; ++p){ const LorentzPole &pole = mat.poles[p]; y[row0 + 2*p + 0] -= (pole.omega_p/pole.omega_0) * y[EZ_OFF + IDX(i,j)]; } } } }
/* \brief Create FFT Plan * */ void CFFTScalar::CreatePlan(int nSign, bool* bDimTrans) { //If plan already exists, destroy it for the new one if(_bPlanSet) fftw_destroy_plan(_plan); //Set default values for rank and how many _nRank=0; for (int i=0;i<3;i++) _nN[i]=1; _nHowMany=1; _nInembed=NULL; _nOnembed=NULL; _nIstride=1; _nIdist=1; _nOstride=1; _nOdist=1; //determine how many ffts to perform and size of ffts _nHowMany=_myGrid.GetTotal(); int myN{0}; for(int i=_myGrid.GetDim()-1;i>=0;i--) { if(bDimTrans[i]) { _nHowMany/=_myGrid.GetSize(i); _nN[myN]=_myGrid.GetSize(i); myN++; _nIdist*=_myGrid.GetSize(i); _nRank++; } /*else { _nIstride*=_myGrid.GetSize(i); }*/ } _nOdist=_nIdist; _nOstride=_nIstride; //from manual _nInembed=_nN; _nOnembed=_nN; /* std::cout<<"_nRank " << _nRank<< "\n"; std::cout<<"_nN " << _nN[0]<< "\n"; std::cout<<"_nN " << _nN[1]<< "\n"; std::cout<<"_nN " << _nN[2]<< "\n"; std::cout<<"_nHowMany " << _nHowMany<< "\n"; std::cout<<"_nIstride " << _nIstride<< "\n"; std::cout<<"_nIdist " << _nIdist<< "\n"; */ //setup the plan _plan=fftw_plan_many_dft(_nRank, _nN, _nHowMany, _dVal, _nInembed, _nIstride, _nIdist, _dVal, _nOnembed, _nOstride, _nOdist, nSign, FFTW_ESTIMATE); _bPlanSet=true; }