/*
 * Allocate and fill an Athena 3D FFT plan.
 *
 *   pD               domain; only its Comm_Domain is read, and only when
 *                    FFT_BLOCK_DECOMP is defined
 *   gnx3, gnx2, gnx1 global grid extents
 *   gks..gie         inclusive start/end indices of the local block
 *   data             buffer to plan against, or NULL to let this routine
 *                    allocate one with ath_3d_fft_malloc()
 *   al               when data is NULL and al == 0, the buffer allocated here
 *                    is released again before returning
 *   dir              ATH_FFT_FORWARD or ATH_FFT_BACKWARD
 *
 * Returns the newly malloc'ed plan.
 *
 * NOTE(review): when data is NULL and al != 0 the buffer allocated here is
 * neither freed nor handed back to the caller by this function -- confirm
 * that this is intended.
 * NOTE(review): in the FFTW branch the plan is built with FFTW_MEASURE, which
 * according to the FFTW manual may overwrite the planning buffer.
 */
struct ath_3d_fft_plan *ath_3d_fft_create_plan(DomainS *pD, int gnx3, int gnx2,
                                               int gnx1, int gks, int gke,
                                               int gjs, int gje, int gis,
                                               int gie, ath_fft_data *data,
                                               int al, ath_fft_direction dir)
{
  int nbuf, release_scratch;
  struct ath_3d_fft_plan *result;

  if (!((dir == ATH_FFT_FORWARD) || (dir == ATH_FFT_BACKWARD))) {
    ath_error("Invalid Athena FFT direction.\n");
  }

  /* Storage for the plan descriptor itself */
  result = (struct ath_3d_fft_plan *)malloc(sizeof(struct ath_3d_fft_plan));
  if (result == NULL) {
    ath_error("[ath_3d_fft_plan] Couldn't malloc for FFT plan.\n");
  }

  /* Transform direction */
  result->dir = dir;

  /* Local and global element counts (used for malloc and memset) */
  result->cnt  = (gke-gks+1)*(gje-gjs+1)*(gie-gis+1);
  result->gcnt = gnx3*gnx2*gnx1;

  /* A scratch buffer is temporary only if we create it here and al == 0 */
  release_scratch = ((data == NULL) && (al == 0)) ? 1 : 0;

  if (data == NULL) {
    data = (ath_fft_data *)ath_3d_fft_malloc(result);
    if (data == NULL) {
      ath_error("[ath_3d_fft_plan] Couln't malloc for FFT plan data.\n");
    }
  }

  /* Build the underlying library plan */
#ifdef FFT_BLOCK_DECOMP
  /* The block-decomposition library uses one plan for both directions */
  result->plan = fft_3d_create_plan(pD->Comm_Domain, gnx3, gnx2, gnx1,
                                    gks, gke, gjs, gje, gis, gie,
                                    gks, gke, gjs, gje, gis, gie,
                                    0, 0, &nbuf);
#else /* FFT_BLOCK_DECOMP */
  result->plan = fftw_plan_dft_3d(gnx1, gnx2, gnx3, data, data,
                                  (dir == ATH_FFT_FORWARD) ? FFTW_FORWARD
                                                           : FFTW_BACKWARD,
                                  FFTW_MEASURE);
#endif /* FFT_BLOCK_DECOMP */

  if (release_scratch) {
    ath_3d_fft_free(data);
  }

  return result;
}
int main(void) { fftw_complex in[N0][N1][N2], out[N0][N1][N2], out2[N0][N1][N2]; /* double [2] */ fftw_plan p; int i0, i1, i2; p = fftw_plan_dft_3d(N0, N1, N2, &in[0][0][0], &out[0][0][0], FFTW_FORWARD, FFTW_ESTIMATE); for (i0 = 0; i0 < N0; i0++) for (i1 = 0; i1 < N1; i1++) for (i2 = 0; i2 < N2; i2++) { in[i0][i1][i2][0] = sin(2*M_PI*i0/N0) + 3*cos(6*M_PI*i1/N1); in[i0][i1][i2][1] = 5*cos(4*M_PI*i2/N2); } fftw_execute(p); ft3d(N0, N1, N2, &in[0][0][0], &out2[0][0][0]); for (i0 = 0; i0 < N0; i0++) for (i1 = 0; i1 < N1; i1++) for (i2 = 0; i2 < N2; i2++) if ( fabs(out[i0][i1][i2][0]) > 1e-3 || fabs(out[i0][i1][i2][1]) > 1e-3 || fabs(out2[i0][i1][i2][0]) > 1e-3 || fabs(out2[i0][i1][i2][1]) > 1e-3 ) printf("%6d %6d %6d: %20.10f %20.10f | %20.10f %20.10f\n", i0, i1, i2, out[i0][i1][i2][0], out[i0][i1][i2][1], out2[i0][i1][i2][0], out2[i0][i1][i2][1]); fftw_destroy_plan(p); fftw_cleanup(); return 0; }
/*
 * MatApply_USFFT_Private - intended to apply the USFFT matrix A to x, writing y,
 * creating the FFTW plan on first use and re-using it afterwards.
 *
 * As written, every statement except PetscFunctionBegin / PetscFunctionReturn(0)
 * is compiled out with "#if 0", so the function currently does nothing and
 * returns 0; A, plan, direction, x and y are unused.
 *
 * NOTE(review): the disabled code is not compilable as it stands: the
 * "Mat_USFFT* = ..." declaration has no variable name (the body refers to
 * "usfft"), "x_array" is used but never declared (only r_array and y_array are),
 * and the final VecRestoreArray is issued on x/x_array although the matching
 * VecGetArray was issued on usfft->resample/r_array. The switch also tests
 * usfft->dim while the rest of the code uses usfft->ndim. Fix these before
 * re-enabling.
 */
PetscErrorCode MatApply_USFFT_Private(Mat A, fftw_plan *plan, int direction, Vec x,Vec y)
{
#if 0
  PetscErrorCode ierr;
  PetscScalar    *r_array, *y_array;
  Mat_USFFT* = (Mat_USFFT*)(A->data);
#endif

  PetscFunctionBegin;
#if 0
  /* resample x to usfft->resample */
  ierr = MatResample_USFFT_Private(A, x);CHKERRQ(ierr);

  /* NB: for now we use outdim for both x and y; this will change once a full USFFT is implemented */
  ierr = VecGetArray(usfft->resample,&r_array);CHKERRQ(ierr);
  ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
  if (!*plan) { /* create a plan then execute it*/
    if (usfft->dof == 1) {
      /* single degree of freedom: one plain complex DFT of the right rank */
#if defined(PETSC_DEBUG_USFFT)
      ierr = PetscPrintf(PetscObjectComm((PetscObject)A), "direction = %d, usfft->ndim = %d\n", direction, usfft->ndim);CHKERRQ(ierr);
      for (int ii = 0; ii < usfft->ndim; ++ii) {
        ierr = PetscPrintf(PetscObjectComm((PetscObject)A), "usfft->outdim[%d] = %d\n", ii, usfft->outdim[ii]);CHKERRQ(ierr);
      }
#endif

      switch (usfft->dim) {
      case 1:
        *plan = fftw_plan_dft_1d(usfft->outdim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,direction,usfft->p_flag);
        break;
      case 2:
        *plan = fftw_plan_dft_2d(usfft->outdim[0],usfft->outdim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,direction,usfft->p_flag);
        break;
      case 3:
        *plan = fftw_plan_dft_3d(usfft->outdim[0],usfft->outdim[1],usfft->outdim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,direction,usfft->p_flag);
        break;
      default:
        *plan = fftw_plan_dft(usfft->ndim,usfft->outdim,(fftw_complex*)x_array,(fftw_complex*)y_array,direction,usfft->p_flag);
        break;
      }
      fftw_execute(*plan);
    } /* if (dof == 1) */
    else { /* if (dof > 1) */
      /* several interleaved components: batched transform, stride = dof, dist = 1 */
      *plan = fftw_plan_many_dft(/*rank*/usfft->ndim, /*n*/usfft->outdim, /*howmany*/usfft->dof,
                                 (fftw_complex*)x_array, /*nembed*/usfft->outdim, /*stride*/usfft->dof, /*dist*/1,
                                 (fftw_complex*)y_array, /*nembed*/usfft->outdim, /*stride*/usfft->dof, /*dist*/1,
                                 /*sign*/direction, /*flags*/usfft->p_flag);
      fftw_execute(*plan);
    } /* if (dof > 1) */
  } /* if (!*plan) */
  else { /* if (*plan) */
    /* use existing plan */
    fftw_execute_dft(*plan,(fftw_complex*)x_array,(fftw_complex*)y_array);
  }
  ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
  ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
} /* MatApply_USFFT_Private() */
void MPC::compute_g_G(double &g_0, vector<double> &g_G, int N) { double L = PtclRef->Lattice.WignerSeitzRadius; double Linv = 1.0/L; double Linv3 = Linv*Linv*Linv; // create an FFTW plan Array<complex<double>,3> rBox(N,N,N); Array<complex<double>,3> GBox(N,N,N); // app_log() << "Doing " << N << " x " << N << " x " << N << " FFT.\n"; //create BC handler DTD_BConds<double,3,SUPERCELL_BULK> mybc(PtclRef->Lattice); // Fill the real-space array with f(r) double Ninv = 1.0/(double)N; TinyVector<double,3> u, r; for (int ix=0; ix<N; ix++) { u[0] = Ninv*ix; for (int iy=0; iy<N; iy++) { u[1] = Ninv*iy; for (int iz=0; iz<N; iz++) { u[2] = Ninv*iz; r = PtclRef->Lattice.toCart (u); //DTD_BConds<double,3,SUPERCELL_BULK>::apply (PtclRef->Lattice, r); //double rmag = std::sqrt(dot(r,r)); double rmag = std::sqrt(mybc.apply_bc(r)); if (rmag < L) rBox(ix,iy,iz) = -0.5*rmag*rmag*Linv3 + 1.5*Linv; else rBox(ix,iy,iz) = 1.0/rmag; } } } fftw_plan fft = fftw_plan_dft_3d (N, N, N, (fftw_complex*)rBox.data(), (fftw_complex*) GBox.data(), 1, FFTW_ESTIMATE); fftw_execute (fft); fftw_destroy_plan (fft); // Now, copy data into output, and add on analytic part double norm = Ninv*Ninv*Ninv; int numG = Gints.size(); for (int iG=0; iG < numG; iG++) { TinyVector<int,OHMMS_DIM> gint = Gints[iG]; for (int j=0; j<OHMMS_DIM; j++) gint[j] = (gint[j] + N)%N; g_G[iG] = norm * real(GBox(gint[0], gint[1], gint[2])); } g_0 = norm * real(GBox(0,0,0)); }
void fft3dCPU(T1* d_data, int nx, int ny, int nz)
{
    // In-place forward 3D complex DFT of d_data on the CPU using FFTW.
    // The buffer is reinterpreted as fftw_complex and the extents are passed
    // to FFTW as (nz, ny, nx).
    cout << "Running forward xform 3d" << endl;

    fftw_complex* const grid = (fftw_complex*) d_data;
    fftw_plan plan = fftw_plan_dft_3d(nz, ny, nx, grid, grid,
                                      FFTW_FORWARD, FFTW_ESTIMATE);

    // Forward transform, input and output share the same buffer.
    fftw_execute(plan);
    fftw_destroy_plan(plan);
}
/// Builds forward and backward 3D FFTW plans for a convolution of the given
/// extents.
///
/// @param width, height, depth  data extents
/// @param kw                    kernel width
/// @param mode                  0: plan for the extents as given;
///                              1: plan for extents grown by kw - 1 per axis;
///                              anything else prints the mode and throws
///                              std::invalid_argument
/// @param threadMaxCount        when > 1, FFTW threading is initialised and
///                              plans are created for that many threads
convolution_plan::convolution_plan(int width, int height, int depth, int kw, int mode, int threadMaxCount)
{
    if (mode == 0)
    {
        this->width = width;
        this->height = height;
        this->depth = depth;
    }
    else if (mode == 1)
    {
        this->width = width + kw - 1;
        this->height = height + kw - 1;
        this->depth = depth + kw - 1;
    }
    else
    {
        std::cout << mode << std::endl;
        throw std::invalid_argument("Warning: 3d convolution plan: Invalid mode");
    }

    if (threadMaxCount > 1)
    {
        fftw_init_threads(); // This MUST come before all other fftw calls
        fftw_plan_with_nthreads(threadMaxCount);
    }

    this->dim = 3;
    this->kw = kw;
    this->threadMaxCount = threadMaxCount;

    // Throw-away buffers: FFTW_MEASURE needs real arrays to plan against.
    const size_t scratchBytes = sizeof(fftw_complex) * this->width * this->height * this->depth;
    fftw_complex* scratchIn = (fftw_complex*) fftw_malloc(scratchBytes);
    fftw_complex* scratchOut = (fftw_complex*) fftw_malloc(scratchBytes);

    this->forwardPlan = fftw_plan_dft_3d(this->depth, this->height, this->width,
                                         scratchIn, scratchOut, FFTW_FORWARD, FFTW_MEASURE);
    this->backwardPlan = fftw_plan_dft_3d(this->depth, this->height, this->width,
                                          scratchIn, scratchOut, FFTW_BACKWARD, FFTW_MEASURE);

    fftw_free(scratchIn);
    fftw_free(scratchOut);

    this->staticKernel = NULL;
}
/*
 * Forward 3D DFT of `in` into `out`, with every output element divided by the
 * total number of points.
 *
 *   out    receives the n[0] x n[1] x n[2] transform, scaled by 1/(n0*n1*n2)
 *   k      receives three values, 2*pi / (n[d] * delta[d]) for d = 0, 1, 2
 *   in     input array
 *   n      the three grid extents
 *   delta  the three per-axis values used in the k formula above
 */
void fft3d(fftw_complex * out, double * k, fftw_complex * in, int * n, double * delta)
{
    fftw_plan plan = fftw_plan_dft_3d(n[0], n[1], n[2], in, out, FFTW_FORWARD, FFTW_ESTIMATE);

    for (int axis = 0; axis < 3; axis++) {
        k[axis] = 2 * M_PI / (n[axis] * delta[axis]);
    }

    fftw_execute(plan);
    fftw_destroy_plan(plan);

    /* FFTW transforms are unnormalised; apply the 1/(n0*n1*n2) factor here */
    const int total = n[0]*n[1]*n[2];
    for (int i = 0; i < total; i++) {
        out[i] /= total;
    }
}
WGSLIB_DECL bool fft_varmap_3d( VarOut& Yout, const std::vector<double>& data, const std::vector<int>& has_point, int N, int M, int K) { omp_set_num_threads(fft_get_num_threads()); fftw_plan_with_nthreads(fft_get_num_threads()); fftw_complex *z, *Z, *ZI; fftw_complex *z2, *Z2; fftw_complex *ni, *NI, *INI; fftw_plan p; int ZS = (2 * N + 1) * (2 * M + 1) * (2 * K + 1); fft_lock(); z = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); Z = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); ZI = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); z2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); Z2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); ni = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); NI = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); INI = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); fft_unlock(); #pragma omp parallel for for (int i = 0; i < ZS; ++i) { z[i][REAL] = z[i][IMAG] = 0; z2[i][REAL] = z2[i][IMAG] = 0; ni[i][REAL] = ni[i][IMAG] = 0; } #pragma omp parallel for for (int i = 0; i < N; ++i) { for (int j = 0; j < M; ++j) { for (int k = 0; k < K; ++k) { double v = data _p(i, j, k); int hp = has_point _p(i, j, k); ni _p3(i, j, k)[REAL] = hp; if (hp) { z _p3(i, j, k)[REAL] = v; z2 _p3(i, j, k)[REAL] = v * v; } } } } fft_lock(); p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, z, Z, FFTW_FORWARD, FFTW_ESTIMATE); fft_unlock(); fftw_execute(p); fft_lock(); p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, z2, Z2, FFTW_FORWARD, FFTW_ESTIMATE); fft_unlock(); fftw_execute(p); fft_lock(); p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, ni, NI, FFTW_FORWARD, FFTW_ESTIMATE); fft_unlock(); fftw_execute(p); #pragma omp parallel for for (int h = 0; h < ZS; ++h) { fftw_complex Z2_I; fftw_complex Z_Z; fftw_complex I_Z2; fftw_complex I_I; mul_conj(Z2_I, Z2[h], NI[h]); mul_conj(Z_Z, Z[h], Z[h]); mul_conj(I_Z2, NI[h], Z2[h]); mul_conj(I_I, NI[h], NI[h]); Z[h][REAL] = Z2_I[REAL] - 2 * Z_Z[REAL] + I_Z2[REAL]; 
Z[h][IMAG] = Z2_I[IMAG] - 2 * Z_Z[IMAG] + I_Z2[IMAG]; NI[h][REAL] = I_I[REAL]; NI[h][IMAG] = I_I[IMAG]; } fft_lock(); p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, Z, ZI, FFTW_BACKWARD, FFTW_ESTIMATE); fft_unlock(); fftw_execute(p); fft_lock(); p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, NI, INI, FFTW_BACKWARD, FFTW_ESTIMATE); fft_unlock(); fftw_execute(p); #pragma omp parallel for for (int hx = 0; hx < N; ++hx) { for (int hy = 0; hy < M; ++hy) { for (int hz = 0; hz < K; ++hz) { double Y = ZI _p3(hx, hy, hz)[REAL] / ZS; double N = INI _p3(hx, hy, hz)[REAL] / ZS; if (N > 0.01) { Yout.varmap _p(hx, hy, hz) = Y / (2 * N); } else { Yout.varmap _p(hx, hy, hz) = 0; } Yout.ni _p(hx, hy, hz) = N; } } } fft_lock(); fftw_destroy_plan(p); fftw_free(z); fftw_free(Z); fftw_free(ZI); fftw_free(z2); fftw_free(Z2); fftw_free(ni); fftw_free(NI); fftw_free(INI); fft_unlock(); return true; }
WGSLIB_DECL bool fft_crossvarmap_3d_declus( VarOut& Yout, const std::vector<double>& weigth1, const std::vector<double>& data1, const std::vector<int>& has_point1, const std::vector<double>& data2, const std::vector<int>& has_point2, int N, int M, int K) { omp_set_num_threads(fft_get_num_threads()); fftw_plan_with_nthreads(fft_get_num_threads()); fftw_complex *i1i2, *I1I2; fftw_complex *z1i2, *Z1I2; fftw_complex *i1z2, *I1Z2; fftw_complex *z1z2, *Z1Z2; fftw_complex *w, *W; fftw_complex *wz1, *WZ1; fftw_complex *wz2, *WZ2; fftw_complex *wz1z2, *WZ1Z2; fftw_plan p; int ZS = (2 * N + 1) * (2 * M + 1) * (2 * K + 1); fft_lock(); i1i2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); I1I2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); z1i2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); Z1I2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); i1z2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); I1Z2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); z1z2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); Z1Z2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); w = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); W = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); wz1 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); WZ1 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); wz2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); WZ2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); wz1z2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); WZ1Z2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); fft_unlock(); #pragma omp parallel for for (int i = 0; i < ZS; ++i) { i1i2[i][REAL] = i1i2[i][IMAG] = 0; I1I2[i][REAL] = I1I2[i][IMAG] = 0; z1i2[i][REAL] = z1i2[i][IMAG] = 0; Z1I2[i][REAL] = Z1I2[i][IMAG] = 0; i1z2[i][REAL] = i1z2[i][IMAG] = 0; I1Z2[i][REAL] = I1Z2[i][IMAG] = 0; z1z2[i][REAL] = z1z2[i][IMAG] = 0; Z1Z2[i][REAL] = Z1Z2[i][IMAG] = 0; w[i][REAL] = w[i][IMAG] = 0; W[i][REAL] = 
W[i][IMAG] = 0; wz1[i][REAL] = wz1[i][IMAG] = 0; WZ1[i][REAL] = WZ1[i][IMAG] = 0; wz2[i][REAL] = wz2[i][IMAG] = 0; WZ2[i][REAL] = WZ2[i][IMAG] = 0; wz1z2[i][REAL] = wz1z2[i][IMAG] = 0; WZ1Z2[i][REAL] = WZ1Z2[i][IMAG] = 0; } #pragma omp parallel for for (int i = 0; i < N; ++i) { for (int j = 0; j < M; ++j) { for (int k = 0; k < K; ++k) { double v1 = data1 _p(i, j, k); double w1_ = weigth1 _p(i, j, k); int i1 = has_point1 _p(i, j, k); double v2 = data2 _p(i, j, k); int i2 = has_point2 _p(i, j, k); i1i2 _p3(i, j, k)[REAL] = i1 * i2; z1i2 _p3(i, j, k)[REAL] = v1 * i2; i1z2 _p3(i, j, k)[REAL] = i1 * v2; z1z2 _p3(i, j, k)[REAL] = v1 * v2; w _p3(i, j, k)[REAL] = w1_; wz1 _p3(i, j, k)[REAL] = w1_ * v1; wz2 _p3(i, j, k)[REAL] = w1_ * v2; wz1z2 _p3(i, j, k)[REAL] = w1_ * v1 * v2; } } } /////////////////////////////////////////////////////////////////// fft_lock(); p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, i1i2, I1I2, FFTW_FORWARD, FFTW_ESTIMATE); fft_unlock(); fftw_execute(p); fft_lock(); p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, z1i2, Z1I2, FFTW_FORWARD, FFTW_ESTIMATE); fft_unlock(); fftw_execute(p); fft_lock(); p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, i1z2, I1Z2, FFTW_FORWARD, FFTW_ESTIMATE); fft_unlock(); fftw_execute(p); fft_lock(); p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, z1z2, Z1Z2, FFTW_FORWARD, FFTW_ESTIMATE); fft_unlock(); fftw_execute(p); //////////////////////////////////////////////////////////////////////////////////// fft_lock(); p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, w, W, FFTW_FORWARD, FFTW_ESTIMATE); fft_unlock(); fftw_execute(p); fft_lock(); p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, wz1, WZ1, FFTW_FORWARD, FFTW_ESTIMATE); fft_unlock(); fftw_execute(p); fft_lock(); p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, wz2, WZ2, FFTW_FORWARD, FFTW_ESTIMATE); fft_unlock(); fftw_execute(p); fft_lock(); p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, wz1z2, WZ1Z2, FFTW_FORWARD, 
FFTW_ESTIMATE); fft_unlock(); fftw_execute(p); #pragma omp parallel for for (int h = 0; h < ZS; ++h) { fftw_complex W_I1I2; fftw_complex I1I2_W; fftw_complex WZ1Z2_I1I2; fftw_complex I1I2_WZ1Z2; fftw_complex WZ2_Z1I2; fftw_complex Z1I2_WZ2; fftw_complex WZ1_I1Z2; fftw_complex I1Z2_WZ1; fftw_complex W_Z1Z2; fftw_complex Z1Z2_W; mul_conj(W_I1I2, W[h], I1I2[h]); mul_conj(I1I2_W, I1I2[h], W[h]); mul_conj(WZ1Z2_I1I2, WZ1Z2[h], I1I2[h]); mul_conj(I1I2_WZ1Z2, I1I2[h], WZ1Z2[h]); mul_conj(WZ2_Z1I2, WZ2[h], Z1I2[h]); mul_conj(Z1I2_WZ2, Z1I2[h], WZ2[h]); mul_conj(WZ1_I1Z2, WZ1[h], I1Z2[h]); mul_conj(I1Z2_WZ1, I1Z2[h], WZ1[h]); mul_conj(W_Z1Z2, W[h], Z1Z2[h]); mul_conj(Z1Z2_W, Z1Z2[h], W[h]); Z1Z2[h][REAL] = WZ1Z2_I1I2[REAL] - WZ2_Z1I2[REAL] - WZ1_I1Z2[REAL] + W_Z1Z2[REAL] + Z1Z2_W[REAL] - I1Z2_WZ1[REAL] - Z1I2_WZ2[REAL] + I1I2_WZ1Z2[REAL]; Z1Z2[h][IMAG] = WZ1Z2_I1I2[IMAG] - WZ2_Z1I2[IMAG] - WZ1_I1Z2[IMAG] + W_Z1Z2[IMAG] + Z1Z2_W[IMAG] - I1Z2_WZ1[IMAG] - Z1I2_WZ2[IMAG] + I1I2_WZ1Z2[IMAG]; W[h][REAL] = 2 * (W_I1I2[REAL] + I1I2_W[REAL]); W[h][IMAG] = 2 * (W_I1I2[IMAG] + I1I2_W[IMAG]); } fft_lock(); p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, Z1Z2, z1z2, FFTW_BACKWARD, FFTW_ESTIMATE); fft_unlock(); fftw_execute(p); fft_lock(); p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, W, w, FFTW_BACKWARD, FFTW_ESTIMATE); fft_unlock(); fftw_execute(p); #pragma omp parallel for for (int hx = 0; hx < N; ++hx) { for (int hy = 0; hy < M; ++hy) { for (int hz = 0; hz < K; ++hz) { double Y = z1z2 _p3(hx, hy, hz)[REAL] / ZS; double N = w _p3(hx, hy, hz)[REAL] / ZS; if (N > 0.01) { Yout.varmap _p(hx, hy, hz) = Y / (N); } else { Yout.varmap _p(hx, hy, hz) = 0; } Yout.ni _p(hx, hy, hz) = N; } } } fft_lock(); fftw_destroy_plan(p); fftw_free(i1i2); fftw_free(I1I2); fftw_free(z1i2); fftw_free(Z1I2); fftw_free(i1z2); fftw_free(I1Z2); fftw_free(z1z2); fftw_free(Z1Z2); fftw_free(w); fftw_free(W); fftw_free(wz1); fftw_free(WZ1); fftw_free(wz2); fftw_free(WZ2); fftw_free(wz1z2); 
fftw_free(WZ1Z2); fft_unlock(); return true; }
WGSLIB_DECL bool fft_crossvarmap_3d( VarOut& Yout, const std::vector<double>& data1, const std::vector<int>& has_point1, const std::vector<double>& data2, const std::vector<int>& has_point2, int N, int M, int K) { omp_set_num_threads(fft_get_num_threads()); fftw_plan_with_nthreads(fft_get_num_threads()); fftw_complex *Z1I; fftw_complex *I1I; fftw_complex *z1z2, *Z1Z2; fftw_complex *i2z1, *I2Z1; fftw_complex *i1z2, *I1Z2; fftw_complex *i1i2, *I1I2; fftw_plan p; int ZS = (2 * N + 1) * (2 * M + 1) * (2 * K + 1); fft_lock(); Z1I = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); I1I = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); z1z2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); Z1Z2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); i2z1 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); I2Z1 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); i1z2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); I1Z2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); i1i2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); I1I2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS); fft_unlock(); #pragma omp parallel for for (int i = 0; i < ZS; ++i) { z1z2[i][REAL] = z1z2[i][IMAG] = 0; i1z2[i][REAL] = i1z2[i][IMAG] = 0; i2z1[i][REAL] = i2z1[i][IMAG] = 0; i1i2[i][REAL] = i1i2[i][IMAG] = 0; } #pragma omp parallel for for (int i = 0; i < N; ++i) { for (int j = 0; j < M; ++j) { for (int k = 0; k < K; ++k) { double v1 = data1 _p(i, j, k); double v2 = data2 _p(i, j, k); int hp1 = has_point1 _p(i, j, k); int hp2 = has_point2 _p(i, j, k); z1z2 _p3(i, j, k)[REAL] = v1 * v2; i1z2 _p3(i, j, k)[REAL] = hp1 * v2; i2z1 _p3(i, j, k)[REAL] = hp2 * v1; i1i2 _p3(i, j, k)[REAL] = hp1 * hp2; } } } fft_lock(); p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, z1z2, Z1Z2, FFTW_FORWARD, FFTW_ESTIMATE); fft_unlock(); fftw_execute(p); fft_lock(); p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, i1z2, I1Z2, FFTW_FORWARD, FFTW_ESTIMATE); 
fft_unlock(); fftw_execute(p); fft_lock(); p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, i2z1, I2Z1, FFTW_FORWARD, FFTW_ESTIMATE); fft_unlock(); fftw_execute(p); fft_lock(); p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, i1i2, I1I2, FFTW_FORWARD, FFTW_ESTIMATE); fft_unlock(); fftw_execute(p); #pragma omp parallel for for (int h = 0; h < ZS; ++h) { fftw_complex A; fftw_complex B; fftw_complex C; fftw_complex D; fftw_complex I1_I2; mul_conj(A, Z1Z2[h], I1I2[h]); mul_conj(D, I1I2[h], Z1Z2[h]); mul_conj(B, I2Z1[h], I1Z2[h]); mul_conj(C, I1Z2[h], I2Z1[h]); mul_conj(I1_I2, I1I2[h], I1I2[h]); Z1Z2[h][REAL] = A[REAL] - B[REAL] - C[REAL] + D[REAL]; Z1Z2[h][IMAG] = A[IMAG] - B[IMAG] - C[IMAG] + D[IMAG]; I1I2[h][REAL] = I1_I2[REAL]; I1I2[h][IMAG] = I1_I2[IMAG]; } fft_lock(); p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, Z1Z2, Z1I, FFTW_BACKWARD, FFTW_ESTIMATE); fft_unlock(); fftw_execute(p); fft_lock(); p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, I1I2, I1I, FFTW_BACKWARD, FFTW_ESTIMATE); fft_unlock(); fftw_execute(p); #pragma omp parallel for for (int hx = 0; hx < N; ++hx) { for (int hy = 0; hy < M; ++hy) { for (int hz = 0; hz < K; ++hz) { double Y = Z1I _p3(hx, hy, hz)[REAL] / ZS; double N = I1I _p3(hx, hy, hz)[REAL] / ZS; if (N > 0.01) { Yout.varmap _p(hx, hy, hz) = Y / (2 * N); } else { Yout.varmap _p(hx, hy, hz) = 0; } Yout.ni _p(hx, hy, hz) = N; } } } fft_lock(); fftw_destroy_plan(p); fftw_free(Z1I); fftw_free(I1I); fftw_free(z1z2); fftw_free(Z1Z2); fftw_free(i2z1); fftw_free(I2Z1); fftw_free(i1z2); fftw_free(I1Z2); fftw_free(i1i2); fftw_free(I1I2); fft_unlock(); return true; }
/// Weighted (declustered) map of a single variable: fills Yout.varmap and
/// Yout.ni for all lags in [0,N) x [0,M) x [0,K) using zero-padded FFT
/// correlations on a (2N+1) x (2M+1) x (2K+1) grid.
///
/// Six padded grids are built: the indicator `ni` and, where has_point is
/// non-zero, v, v*v, v*w, v*v*w and w. Their spectra are combined with
/// mul_conj (assumed to compute a*conj(b) -- TODO confirm) into a numerator
/// (stored in Z) and a weight sum 2*(W*conj(NI) + NI*conj(W)) (stored in NI).
/// After the inverse transforms and division by the grid size,
/// varmap = numerator / weight where weight > 0.01, else 0; ni = weight.
///
/// Every FFTW plan is destroyed right after it has been executed. The previous
/// code overwrote one plan handle eight times and destroyed only the last
/// plan, leaking the others on every call.
///
/// @param Yout       output maps (varmap, ni), indexed through the _p macro
/// @param data       sample values
/// @param weigth     per-cell weights (parameter spelling kept for callers)
/// @param has_point  per-cell indicator, non-zero where a sample exists
/// @param N, M, K    grid extents
/// @return always true
WGSLIB_DECL bool fft_varmap_3d_declus(
    VarOut& Yout,
    const std::vector<double>& data,
    const std::vector<double>& weigth,
    const std::vector<int>& has_point,
    int N, int M, int K)
{
    // Plan, execute and destroy one out-of-place 3D transform. Planning and
    // destruction are wrapped in fft_lock()/fft_unlock() exactly as the
    // original code did; execution runs outside the lock.
    struct OneShotFft
    {
        static void run(int d0, int d1, int d2,
                        fftw_complex* in, fftw_complex* out, int sign)
        {
            fft_lock();
            fftw_plan plan = fftw_plan_dft_3d(d0, d1, d2, in, out, sign, FFTW_ESTIMATE);
            fft_unlock();

            fftw_execute(plan);

            fft_lock();
            fftw_destroy_plan(plan);
            fft_unlock();
        }
    };

    omp_set_num_threads(fft_get_num_threads());
    fftw_plan_with_nthreads(fft_get_num_threads());

    fftw_complex *z, *Z, *ZI;
    fftw_complex *z2, *Z2;
    fftw_complex *ni, *NI, *INI;
    fftw_complex *w, *W;
    fftw_complex *z2w, *Z2W;
    fftw_complex *zw, *ZW;

    const int dimN = 2 * N + 1;
    const int dimM = 2 * M + 1;
    const int dimK = 2 * K + 1;
    int ZS = (2 * N + 1) * (2 * M + 1) * (2 * K + 1);

    fft_lock();
    z   = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
    Z   = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
    ZI  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
    z2  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
    Z2  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
    ni  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
    NI  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
    INI = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
    w   = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
    W   = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
    z2w = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
    Z2W = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
    zw  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
    ZW  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
    fft_unlock();

    // Zero the padded input grids.
#pragma omp parallel for
    for (int i = 0; i < ZS; ++i) {
        z[i][REAL] = z[i][IMAG] = 0;
        z2[i][REAL] = z2[i][IMAG] = 0;
        ni[i][REAL] = ni[i][IMAG] = 0;
        w[i][REAL] = w[i][IMAG] = 0;
        z2w[i][REAL] = z2w[i][IMAG] = 0;
        zw[i][REAL] = zw[i][IMAG] = 0;
    }

    // Copy the samples into the padded grids.
#pragma omp parallel for
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < M; ++j) {
            for (int k = 0; k < K; ++k) {
                double v = data _p(i, j, k);
                double w_ = weigth _p(i, j, k);
                int hp = has_point _p(i, j, k);
                ni _p3(i, j, k)[REAL] = hp;
                if (hp) {
                    z _p3(i, j, k)[REAL] = v;
                    z2 _p3(i, j, k)[REAL] = v * v;
                    zw _p3(i, j, k)[REAL] = v * w_;
                    z2w _p3(i, j, k)[REAL] = v * v * w_;
                    w _p3(i, j, k)[REAL] = w_;
                }
            }
        }
    }

    // Take the grids to the frequency domain.
    OneShotFft::run(dimN, dimM, dimK, z, Z, FFTW_FORWARD);
    OneShotFft::run(dimN, dimM, dimK, w, W, FFTW_FORWARD);
    OneShotFft::run(dimN, dimM, dimK, zw, ZW, FFTW_FORWARD);
    OneShotFft::run(dimN, dimM, dimK, z2w, Z2W, FFTW_FORWARD);
    OneShotFft::run(dimN, dimM, dimK, z2, Z2, FFTW_FORWARD);
    OneShotFft::run(dimN, dimM, dimK, ni, NI, FFTW_FORWARD);

    // Perform the convolution: Z and NI are overwritten with numerator and weight.
#pragma omp parallel for
    for (int h = 0; h < ZS; ++h) {
        fftw_complex Z2W_I;
        fftw_complex ZW_Z;
        fftw_complex W_Z2;
        fftw_complex Z2_W;
        fftw_complex Z_ZW;
        fftw_complex I_Z2W;
        fftw_complex W_I;
        fftw_complex I_W;

        mul_conj(Z2W_I, Z2W[h], NI[h]);
        mul_conj(ZW_Z, ZW[h], Z[h]);
        mul_conj(W_Z2, W[h], Z2[h]);
        mul_conj(Z2_W, Z2[h], W[h]);
        mul_conj(Z_ZW, Z[h], ZW[h]);
        mul_conj(I_Z2W, NI[h], Z2W[h]);
        mul_conj(I_W, NI[h], W[h]);
        mul_conj(W_I, W[h], NI[h]);

        Z[h][REAL] = Z2W_I[REAL] - 2 * ZW_Z[REAL] + W_Z2[REAL] + Z2_W[REAL] - 2 * Z_ZW[REAL] + I_Z2W[REAL];
        Z[h][IMAG] = Z2W_I[IMAG] - 2 * ZW_Z[IMAG] + W_Z2[IMAG] + Z2_W[IMAG] - 2 * Z_ZW[IMAG] + I_Z2W[IMAG];

        NI[h][REAL] = 2 * (W_I[REAL] + I_W[REAL]);
        NI[h][IMAG] = 2 * (W_I[IMAG] + I_W[IMAG]);
    }

    // Inverse transforms (unnormalised; the 1/ZS factor is applied below).
    OneShotFft::run(dimN, dimM, dimK, Z, ZI, FFTW_BACKWARD);
    OneShotFft::run(dimN, dimM, dimK, NI, INI, FFTW_BACKWARD);

#pragma omp parallel for
    for (int hx = 0; hx < N; ++hx) {
        for (int hy = 0; hy < M; ++hy) {
            for (int hz = 0; hz < K; ++hz) {
                double Y = ZI _p3(hx, hy, hz)[REAL] / ZS;
                // Named `weight` rather than `N` so the grid extent N is not shadowed.
                double weight = INI _p3(hx, hy, hz)[REAL] / ZS;
                if (weight > 0.01) {
                    Yout.varmap _p(hx, hy, hz) = Y / weight;
                }
                else {
                    Yout.varmap _p(hx, hy, hz) = 0;
                }
                Yout.ni _p(hx, hy, hz) = weight;
            }
        }
    }

    fft_lock();
    fftw_free(z);
    fftw_free(Z);
    fftw_free(ZI);
    fftw_free(z2);
    fftw_free(Z2);
    fftw_free(zw);
    fftw_free(ZW);
    fftw_free(z2w);
    fftw_free(Z2W);
    fftw_free(w);
    fftw_free(W);
    fftw_free(ni);
    fftw_free(NI);
    fftw_free(INI);
    fft_unlock();

    return true;
}
void MPC::init_spline() { Array<complex<double>,3> rBox(SplineDim[0], SplineDim[1], SplineDim[2]), GBox(SplineDim[0], SplineDim[1], SplineDim[2]); Array<double,3> splineData(SplineDim[0], SplineDim[1], SplineDim[2]); GBox = complex<double>(); Vconst = 0.0; // Now fill in elements of GBox double vol = PtclRef->Lattice.Volume; double volInv = 1.0/vol; for (int iG=0; iG < Gvecs.size(); iG++) { TinyVector<int,OHMMS_DIM> gint = Gints[iG]; PosType G = Gvecs[iG]; double G2 = dot(G,G); TinyVector<int,OHMMS_DIM> index; for (int j=0; j<OHMMS_DIM; j++) index[j] = (gint[j] + SplineDim[j]) % SplineDim[j]; if (!(index[0]==0 && index[1]==0 && index[2]==0)) { GBox(index[0], index[1], index[2]) = vol * Rho_G[iG] * (4.0*M_PI*volInv/G2 - f_G[iG]); Vconst -= 0.5 * vol * vol * norm(Rho_G[iG]) * (4.0*M_PI*volInv/G2 - f_G[iG]); } } // G=0 component calculated seperately GBox(0,0,0) = -vol * f_0 * Rho_G[0]; Vconst += 0.5 * vol * vol * f_0 * norm(Rho_G[0]); app_log() << " Constant potential = " << Vconst << endl; fftw_plan fft = fftw_plan_dft_3d (SplineDim[0], SplineDim[1], SplineDim[2], (fftw_complex*)GBox.data(), (fftw_complex*) rBox.data(), -1, FFTW_ESTIMATE); fftw_execute (fft); fftw_destroy_plan (fft); for (int i0=0; i0<SplineDim[0]; i0++) for (int i1=0; i1<SplineDim[1]; i1++) for (int i2=0; i2<SplineDim[2]; i2++) splineData(i0, i1, i2) = real(rBox(i0,i1,i2)); BCtype_d bc0, bc1, bc2; Ugrid grid0, grid1, grid2; grid0.start=0.0; grid0.end=1.0; grid0.num = SplineDim[0]; grid1.start=0.0; grid1.end=1.0; grid1.num = SplineDim[1]; grid2.start=0.0; grid2.end=1.0; grid2.num = SplineDim[2]; bc0.lCode = bc0.rCode = PERIODIC; bc1.lCode = bc1.rCode = PERIODIC; bc2.lCode = bc2.rCode = PERIODIC; VlongSpline = create_UBspline_3d_d (grid0, grid1, grid2, bc0, bc1, bc2, splineData.data()); // grid0.num = PtclRef->Density_r.size(0); // grid1.num = PtclRef->Density_r.size(1); // grid2.num = PtclRef->Density_r.size(2); // DensitySpline = create_UBspline_3d_d (grid0, grid1, grid2, bc0, bc1, bc2, // 
PtclRef->Density_r.data()); }
/*
 * To use FFTW in parallel, the lattice needs to be redistributed;
 * this Fourier-transform function takes a number (n) of 3D blocks;
 * the n blocks shall be distributed over as many processes as possible;
 * then the parallel FFTW is called, distributing the z-axis over remaining processes
 *
 * NOTE(review): the code below calls the serial fftw_plan_dft_3d() on whole
 * gathered volumes; no parallel FFTW call is visible here.
 *
 *   out    out[b][p] receives, for block b, the transform value at momentum p;
 *          after the final exchange rank 0 holds all n blocks
 *   in     in[b] is this process's local sub-volume of block b
 *   n      number of 3D blocks
 *   mom    list of momenta; entries [1], [2], [3] are used as (z, y, x)
 *   nmom   number of momenta
 */
void
qpb_ft(qpb_complex **out, qpb_complex **in, int n, int mom[][4], int nmom)
{
  int rank = problem_params.proc_id;
  int nprocs = problem_params.nprocs;

  /* global (L*) and local (l*) extents; index 1 = z, 2 = y, 3 = x */
  int Lz = problem_params.g_dim[1];
  int Ly = problem_params.g_dim[2];
  int Lx = problem_params.g_dim[3];
  int lz = problem_params.l_dim[1];
  int ly = problem_params.l_dim[2];
  int lx = problem_params.l_dim[3];
  int vol3d = Lx*Ly*Lz;
  int lvol3d = lx*ly*lz;
  MPI_Comm comm_cart = problem_params.mpi_comm_cart;

  /* nprocs_n ends up as the largest divisor of n that does not exceed nprocs */
  int nprocs_n = 0;
  for(int i=1; i<nprocs+1; i++)
    if((n % i) == 0)
      nprocs_n = i;

  /* number of blocks each of the first nprocs_n ranks transforms */
  int n_loc = n / nprocs_n;
  fftw_complex *corr[n_loc];
  qpb_complex *swap = NULL;

  /* only the ranks that transform blocks allocate full-volume buffers */
  if(rank < nprocs_n)
    for(int i=0; i<n_loc; i++)
      corr[i] = fftw_malloc(sizeof(fftw_complex)*vol3d);

  /* NOTE(review): swap holds vol3d elements while the gather delivers
     nprocs*lvol3d -- presumably equal; confirm */
  if(rank < nprocs_n)
    swap = qpb_alloc(sizeof(qpb_complex)*vol3d);

  /* block i+j*n_loc is collected from all processes onto rank j */
  for(int i=0; i<n_loc; i++)
    for(int j=0; j<nprocs_n; j++)
      {
	MPI_Gather(in[i+j*n_loc], lvol3d*sizeof(qpb_complex), MPI_BYTE,
		   swap, lvol3d*sizeof(qpb_complex), MPI_BYTE, j, MPI_COMM_WORLD);

	if(rank == j)
	  {
	    /* scatter each process's chunk to its place in the global
	       volume, stored with x running fastest */
	    for(int p=0; p<nprocs; p++)
	      {
		int coords[ND-1];
		qpb_complex *ptr = swap + p*lvol3d;
		MPI_Cart_coords(comm_cart, p, ND-1, coords);
		int zoff = coords[0]*lz;
		int yoff = coords[1]*ly;
		int xoff = coords[2]*lx;
		for(int z=zoff; z<lz+zoff; z++)
		  for(int y=yoff; y<ly+yoff; y++)
		    for(int x=xoff; x<lx+xoff; x++)
		      {
			corr[i][x + y*Lx + z*Lx*Ly][0] = ptr->re;
			corr[i][x + y*Lx + z*Lx*Ly][1] = ptr->im;
			ptr++;
		      }
	      }
	  }
	MPI_Barrier(MPI_COMM_WORLD);
      }

  /* in-place forward transform of every block held by this rank */
  if(rank < nprocs_n)
    for(int i=0; i<n_loc; i++)
      {
	fftw_plan plan = fftw_plan_dft_3d(Lz, Ly, Lx, corr[i], corr[i], FFTW_FORWARD, FFTW_ESTIMATE);
	fftw_execute(plan);
	fftw_destroy_plan(plan);
      }

  /* pick out the requested momenta, wrapping negative components */
  if(rank < nprocs_n)
    for(int i=0; i<n_loc; i++)
      for(int p=0; p<nmom; p++)
	{
	  int kx = (Lx + mom[p][3]) % Lx;
	  int ky = (Ly + mom[p][2]) % Ly;
	  int kz = (Lz + mom[p][1]) % Lz;
	  out[i][p] = (qpb_complex){corr[i][kx + ky*Lx + kz*Lx*Ly][0],
				    corr[i][kx + ky*Lx + kz*Lx*Ly][1]};
	}

  /* ranks 1..nprocs_n-1 send their results to rank 0, which stores the
     blocks of rank p at out[p*n_loc+i]; the message tag is the sender's rank */
  for(int p=1; p<nprocs_n; p++)
    for(int i=0; i<n_loc; i++)
      {
	if(rank == 0)
	  MPI_Recv(out[p*n_loc+i], nmom*sizeof(qpb_complex), MPI_BYTE, p, p,
		   MPI_COMM_WORLD, MPI_STATUS_IGNORE);
	if(rank == p)
	  MPI_Send(out[i], nmom*sizeof(qpb_complex), MPI_BYTE, 0, p, MPI_COMM_WORLD);
      }

  if(rank < nprocs_n)
    {
      for(int i=0; i<n_loc; i++)
	fftw_free(corr[i]);
      /* NOTE(review): swap came from qpb_alloc() but is released with free();
	 confirm the two are compatible */
      free(swap);
    }
  return;
}
bool c_FourierTransfrom::ifftw_complex_3d(const Mat_<Vec6d> &_input, Mat_<Vec6d> &_output) { size_t height = _input.rows; size_t width = _input.cols; size_t n_channels = _input.channels() / 2; size_t n_pixels = height * width; size_t n_data = n_pixels * n_channels; fftw_complex *in, *out; fftw_plan p; in = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * n_data); out = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * n_data); p = fftw_plan_dft_3d(height, width, n_channels, in, out, FFTW_BACKWARD, FFTW_ESTIMATE); /*!< prepare the data */ for (size_t i_row = 0; i_row < height; ++i_row) { const Vec3d *p = _input.ptr<Vec3d>(i_row); for (size_t i_col = 0; i_col < width; ++i_col) { size_t index = i_row * width + i_col; for (size_t k = 0; k < n_channels; ++k) { in[n_pixels * k + index][0] = p[i_col][k]; in[n_pixels * k + index][1] = p[i_col][k + n_channels]; } #if 0 in[index][0] = p[i_col][4]; in[index][1] = p[i_col][5]; in[n_pixels + index][0] = p[i_col][2]; in[n_pixels + index][1] = p[i_col][3]; in[n_pixels * 2 + index][0] = p[i_col][0]; in[n_pixels * 2 + index][1] = p[i_col][1]; #endif } } fftw_execute(p); /*!< write back data */ _output = Mat_<Vec6d>::zeros(_input.size()); for (size_t i_row = 0; i_row < height; ++i_row) { Vec6d *p = _output.ptr<Vec6d>(i_row); for (size_t i_col = 0; i_col < width; ++i_col) { size_t index = i_row * width + i_col; for (size_t k = 0; k < n_channels; ++k) { p[i_col][k] = out[n_pixels * k + index][0]; p[i_col][k + n_channels] = out[n_pixels * k + index][1]; } #if 0 p[i_col][0] = out[n_pixels * 2 + index][0]; p[i_col][1] = out[n_pixels + index][0]; p[i_col][2] = out[index][0]; p[i_col][3] = out[n_pixels * 2 + index][1]; p[i_col][4] = out[n_pixels + index][1]; p[i_col][5] = out[index][1]; #endif } } _output /= n_data; fftw_destroy_plan(p); fftw_free(in); fftw_free(out); return true; }
/**
 * Fill a real-valued (double) Einspline orbital set for one spin channel
 * from the ES-HDF file behind H5FileID.
 *
 * NOTE(review): in this copy of the file the statements of this function sit
 * on a few very long physical lines, so every "//" comment also hides the
 * statements that follow it on the same line. The description below follows
 * the statements as they are written; the original line breaks have to be
 * restored for the code to behave that way.
 *
 * Parameters:
 *   spin        selects FullBands[spin] and the "/spin_<spin>" part of every
 *               HDF path that is built here.
 *   orbitalSet  destination; its per-orbital vectors, MultiSpline,
 *               FirstOrderSplines, MuffinTins and AtomicOrbitals are written.
 *
 * Steps, in the order they are written:
 *   1. Build realOrbs, one AtomicOrbital<double> per entry of AtomicOrbitals,
 *      copying position, lmax, cutoff, spline and polynomial settings and
 *      the lattice from the complex orbital.
 *   2. Broadcast NumDistinctOrbitals, NumValenceOrbs and NumCoreOrbs, then
 *      resize the per-orbital vectors of orbitalSet to N = NumDistinctOrbitals.
 *   3. On rank 0 only: compute kPoints and MakeTwoCopies for every band, set
 *      HalfG[i] to 1 where |twist0[i]| is within 1e-8 of 0.5 (twist0 is the
 *      twist of SortBands[0]) and call RotateBands_ESHDF. kPoints,
 *      MakeTwoCopies and HalfG are then broadcast.
 *   4. havePsir = !ReadGvectors_ESHDF(). The spline grids span [0,1] with
 *      MeshSize points per axis; an axis is ANTIPERIODIC when its HalfG entry
 *      is set and PERIODIC otherwise. MultiSpline is created with
 *      NumValenceOrbs splines, plus one spline set per ion and direction when
 *      HaveOrbDerivs is set.
 *   5. Size and initialise the muffin tins, allocate realOrbs, read
 *      "/electrons/psi_r_is_complex" on rank 0 and broadcast it, and abort if
 *      bcastSortBands reports core states.
 *   6. Per band: if havePsir, rank 0 reads psi_r (complex data goes through
 *      fix_phase_c2r, real data is used directly). Otherwise rank 0 reads
 *      psi_g, the coefficients are broadcast, unpacked into FFTbox,
 *      transformed in place with an FFTW plan of sign +1 and passed through
 *      fix_phase_rotate_c2r. Either way the result is stored with
 *      set_multi_UBspline_3d_d.
 *   7. Per band and atomic orbital: rank 0 reads radial_spline_<iat> and
 *      poly_coefs_<iat>, both are broadcast and handed to set_band.
 *   8. Per band and muffin tin: rank 0 reads u_lm_r and du_lm_dr, they are
 *      broadcast together with k and handed to set_APW.
 *   9. Copy realOrbs into orbitalSet->AtomicOrbitals and register timers.
 *
 * Every rank has to execute the same sequence of myComm->bcast calls; only
 * the HDF reads are guarded by "if (root)".
 *
 * NOTE(review): isComplex is declared without an initial value here, while
 * the complex<double> overload starts it at 1; on rank 0 its value depends
 * entirely on the HDF read filling it in.
 * NOTE(review): set_APW is called through OrbitalSet (capital O), whereas
 * init_APW used the orbitalSet parameter. Presumably OrbitalSet is a builder
 * member that points at the same object - confirm.
 * NOTE(review): in the complex psi_r branch rawData is only given a shape by
 * the read on rank 0 before myComm->bcast(rawData); check that bcast copes
 * with the unsized array on the other ranks.
 * NOTE(review): FirstOrderSplines is resized to IonPos.size() twice (once
 * unconditionally, once under HaveOrbDerivs).
 * The large commented-out block near the end is the disabled reader for
 * orbital derivatives (see the "FIX HaveOrbDerivs after debugging" marker).
 */
void EinsplineSetBuilder::ReadBands_ESHDF(int spin, EinsplineSetExtended<double>* orbitalSet) { update_token(__FILE__,__LINE__,"ReadBands_ESHDF:double"); ReportEngine PRE("EinsplineSetBuilder","ReadBands_ESHDF(EinsplineSetExtended<double>*"); vector<AtomicOrbital<double> > realOrbs(AtomicOrbitals.size()); for (int iat=0; iat<realOrbs.size(); iat++) { AtomicOrbital<complex<double> > &corb (AtomicOrbitals[iat]); realOrbs[iat].set_pos (corb.Pos); realOrbs[iat].set_lmax (corb.lMax); realOrbs[iat].set_cutoff (corb.CutoffRadius); realOrbs[iat].set_spline (corb.SplineRadius, corb.SplinePoints); realOrbs[iat].set_polynomial (corb.PolyRadius, corb.PolyOrder); realOrbs[iat].Lattice = corb.Lattice; } bool root = myComm->rank()==0; // bcast other stuff myComm->bcast (NumDistinctOrbitals); myComm->bcast (NumValenceOrbs); myComm->bcast (NumCoreOrbs); int N = NumDistinctOrbitals; orbitalSet->kPoints.resize(N); orbitalSet->MakeTwoCopies.resize(N); orbitalSet->StorageValueVector.resize(N); orbitalSet->BlendValueVector.resize(N); orbitalSet->StorageLaplVector.resize(N); orbitalSet->BlendLaplVector.resize(N); orbitalSet->StorageGradVector.resize(N); orbitalSet->BlendGradVector.resize(N); orbitalSet->StorageHessVector.resize(N); orbitalSet->StorageGradHessVector.resize(N); orbitalSet->phase.resize(N); orbitalSet->eikr.resize(N); orbitalSet->NumValenceOrbs = NumValenceOrbs; orbitalSet->NumCoreOrbs = NumCoreOrbs; orbitalSet->FirstOrderSplines.resize(IonPos.size()); // Read in k-points int numOrbs = orbitalSet->getOrbitalSetSize(); int num = 0; vector<BandInfo>& SortBands(*FullBands[spin]); if (root) { for (int iorb=0; iorb<N; iorb++) { int ti = SortBands[iorb].TwistIndex; PosType twist = TwistAngles[ti]; orbitalSet->kPoints[iorb] = orbitalSet->PrimLattice.k_cart(twist); orbitalSet->MakeTwoCopies[iorb] = (num < (numOrbs-1)) && SortBands[iorb].MakeTwoCopies; num += orbitalSet->MakeTwoCopies[iorb] ? 
2 : 1; } PosType twist0 = TwistAngles[SortBands[0].TwistIndex]; for (int i=0; i<OHMMS_DIM; i++) if (std::fabs(std::fabs(twist0[i]) - 0.5) < 1.0e-8) orbitalSet->HalfG[i] = 1; else orbitalSet->HalfG[i] = 0; EinsplineSetBuilder::RotateBands_ESHDF(spin, orbitalSet); } myComm->bcast(orbitalSet->kPoints); myComm->bcast(orbitalSet->MakeTwoCopies); myComm->bcast(orbitalSet->HalfG); // First, check to see if we have already read this in H5OrbSet set(H5FileName, spin, N); bool havePsir=!ReadGvectors_ESHDF(); app_log() << "MeshSize = (" << MeshSize[0] << ", " << MeshSize[1] << ", " << MeshSize[2] << ")\n"; //int nx, ny, nz, bi, ti; int nx, ny, nz; nx=MeshSize[0]; ny=MeshSize[1]; nz=MeshSize[2]; Ugrid x_grid, y_grid, z_grid; BCtype_d xBC, yBC, zBC; if (orbitalSet->HalfG[0]) { xBC.lCode = ANTIPERIODIC; xBC.rCode = ANTIPERIODIC; } else { xBC.lCode = PERIODIC; xBC.rCode = PERIODIC; } if (orbitalSet->HalfG[1]) { yBC.lCode = ANTIPERIODIC; yBC.rCode = ANTIPERIODIC; } else { yBC.lCode = PERIODIC; yBC.rCode = PERIODIC; } if (orbitalSet->HalfG[2]) { zBC.lCode = ANTIPERIODIC; zBC.rCode = ANTIPERIODIC; } else { zBC.lCode = PERIODIC; zBC.rCode = PERIODIC; } x_grid.start = 0.0; x_grid.end = 1.0; x_grid.num = nx; y_grid.start = 0.0; y_grid.end = 1.0; y_grid.num = ny; z_grid.start = 0.0; z_grid.end = 1.0; z_grid.num = nz; // Create the multiUBspline object orbitalSet->MultiSpline = create_multi_UBspline_3d_d (x_grid, y_grid, z_grid, xBC, yBC, zBC, NumValenceOrbs); if (HaveOrbDerivs) { orbitalSet->FirstOrderSplines.resize(IonPos.size()); for (int ion=0; ion<IonPos.size(); ion++) for (int dir=0; dir<OHMMS_DIM; dir++) orbitalSet->FirstOrderSplines[ion][dir] = create_multi_UBspline_3d_d (x_grid, y_grid, z_grid, xBC, yBC, zBC, NumValenceOrbs); } ////////////////////////////////////// // Create the MuffinTin APW splines // ////////////////////////////////////// orbitalSet->MuffinTins.resize(NumMuffinTins); for (int tin=0; tin<NumMuffinTins; tin++) { orbitalSet->MuffinTins[tin].Atom = tin; 
orbitalSet->MuffinTins[tin].set_center (MT_centers[tin]); orbitalSet->MuffinTins[tin].set_lattice(Lattice); orbitalSet->MuffinTins[tin].init_APW (MT_APW_rgrids[tin], MT_APW_lmax[tin], NumValenceOrbs); } for (int iat=0; iat<realOrbs.size(); iat++) { realOrbs[iat].set_num_bands(NumValenceOrbs); realOrbs[iat].allocate(); } int isComplex; if (root) { HDFAttribIO<int> h_isComplex(isComplex); h_isComplex.read(H5FileID, "/electrons/psi_r_is_complex"); } myComm->bcast(isComplex); bool isCore = bcastSortBands(spin,N,root); if(isCore) { APP_ABORT("Core states not supported by ES-HDF yet."); } //this is common Array<double,3> splineData(nx,ny,nz); if(havePsir) { if(isComplex) { app_log() << " Reading complex psi_r and convert to real" << endl; Array<complex<double>,3> rawData; for(int iorb=0,ival=0; iorb<N; ++iorb, ++ival) { int ti=SortBands[iorb].TwistIndex; if(root) { ostringstream path; path << "/electrons/kpoint_" << SortBands[iorb].TwistIndex << "/spin_" << spin << "/state_" << SortBands[iorb].BandIndex << "/psi_r"; HDFAttribIO<Array<complex<double>,3> > h_splineData(rawData); h_splineData.read(H5FileID, path.str().c_str()); } myComm->bcast(rawData); //multiply twist factor and project on the real fix_phase_c2r(rawData,splineData,TwistAngles[ti]); set_multi_UBspline_3d_d (orbitalSet->MultiSpline, ival, splineData.data()); } } else { app_log() << " Reading real psi_r" << endl; for(int iorb=0,ival=0; iorb<N; ++iorb, ++ival) { if(root) { ostringstream path; path << "/electrons/kpoint_" << SortBands[iorb].TwistIndex << "/spin_" << spin << "/state_" << SortBands[iorb].BandIndex << "/psi_r"; HDFAttribIO<Array<double,3> > h_splineData(splineData); h_splineData.read(H5FileID, path.str().c_str()); } myComm->bcast(splineData); set_multi_UBspline_3d_d (orbitalSet->MultiSpline, ival, splineData.data()); } } } else { Array<ComplexType,3> FFTbox; FFTbox.resize(MeshSize[0], MeshSize[1], MeshSize[2]); fftw_plan FFTplan = fftw_plan_dft_3d (MeshSize[0], MeshSize[1], MeshSize[2], 
reinterpret_cast<fftw_complex*>(FFTbox.data()), reinterpret_cast<fftw_complex*>(FFTbox.data()), +1, FFTW_ESTIMATE); for(int iorb=0,ival=0; iorb<N; ++iorb, ++ival) { Vector<complex<double> > cG; int ncg=0; int ti=SortBands[iorb].TwistIndex; if(root) { ostringstream path; path << "/electrons/kpoint_" << SortBands[iorb].TwistIndex << "/spin_" << spin << "/state_" << SortBands[iorb].BandIndex << "/psi_g"; HDFAttribIO<Vector<complex<double> > > h_cG(cG); h_cG.read (H5FileID, path.str().c_str()); ncg=cG.size(); } myComm->bcast(ncg); if(ncg != Gvecs[0].size()) { APP_ABORT("Failed : ncg != Gvecs[0].size()"); } if(!root) cG.resize(ncg); myComm->bcast(cG); unpack4fftw(cG,Gvecs[0],MeshSize,FFTbox); fftw_execute (FFTplan); fix_phase_rotate_c2r(FFTbox,splineData,TwistAngles[ti]); set_multi_UBspline_3d_d (orbitalSet->MultiSpline, ival, splineData.data()); } fftw_destroy_plan(FFTplan); } for(int iorb=0,ival=0; iorb<N; ++iorb, ++ival) { // Read atomic orbital information for (int iat=0; iat<realOrbs.size(); iat++) { app_log() << "Reading orbital " << iat << " for band " << ival << endl; AtomicOrbital<double> &orb = realOrbs[iat]; //AtomicOrbital<complex<double> > &orb = realOrbs[iat]; Array<complex<double>,2> radial_spline(orb.SplinePoints,orb.Numlm), poly_coefs(orb.PolyOrder+1,orb.Numlm); int ti = SortBands[iorb].TwistIndex; if (root) { int bi = SortBands[iorb].BandIndex; ostringstream path; path << "/electrons/kpoint_" << ti << "/spin_" << spin << "/state_" << bi << "/"; ostringstream spline_path, poly_path; spline_path << path.str() << "radial_spline_" << iat; poly_path << path.str() << "poly_coefs_" << iat; HDFAttribIO<Array<complex<double>,2> > h_radial_spline(radial_spline); HDFAttribIO<Array<complex<double>,2> > h_poly_coefs(poly_coefs); h_radial_spline.read(H5FileID, spline_path.str().c_str()); h_poly_coefs.read (H5FileID, poly_path.str().c_str()); } myComm->bcast(radial_spline); myComm->bcast(poly_coefs); realOrbs[iat].set_band (ival, radial_spline, poly_coefs, 
TwistAngles[ti]); } } for(int iorb=0,ival=0; iorb<N; ++iorb, ++ival) { // Now read muffin tin data for (int tin=0; tin<NumMuffinTins; tin++) { // app_log() << "Reading data for muffin tin " << tin << endl; PosType twist, k; int lmax = MT_APW_lmax[tin]; int numYlm = (lmax+1)*(lmax+1); Array<complex<double>,2> u_lm_r(numYlm, MT_APW_num_radial_points[tin]); Array<complex<double>,1> du_lm_dr (numYlm); int ti = SortBands[iorb].TwistIndex; if (root) { int bi = SortBands[iorb].BandIndex; twist = TwistAngles[ti]; k = orbitalSet->PrimLattice.k_cart(twist); string uName = MuffinTinPath (ti, bi,tin) + "u_lm_r"; string duName = MuffinTinPath (ti, bi,tin) + "du_lm_dr"; HDFAttribIO<Array<complex<double>,2> > h_u_lm_r(u_lm_r); HDFAttribIO<Array<complex<double>,1> > h_du_lm_dr(du_lm_dr); h_u_lm_r.read(H5FileID, uName.c_str()); h_du_lm_dr.read(H5FileID, duName.c_str()); } myComm->bcast(u_lm_r); myComm->bcast(du_lm_dr); myComm->bcast(k); double Z = (double)IonTypes(tin); OrbitalSet->MuffinTins[tin].set_APW (ival, k, u_lm_r, du_lm_dr, Z); } } //FIX HaveOrbDerivs after debugging // // Now read orbital derivatives if we have them // if (HaveOrbDerivs) { // for (int ion=0; ion<IonPos.size(); ion++) // for (int dim=0; dim<OHMMS_DIM; dim++) { // if (root) { // int ti = SortBands[iorb].TwistIndex; // int bi = SortBands[iorb].BandIndex; // // app_log() << "Reading orbital derivative for ion " << ion // << " dim " << dim << " spin " << spin << " band " // << bi << " kpoint " << ti << endl; // ostringstream path; // path << "/electrons/kpoint_" << ti << "/spin_" << spin << "/state_" << bi << "/" // << "dpsi_" << ion << "_" << dim << "_r"; // string psirName = path.str(); // if (isComplex) { // HDFAttribIO<Array<complex<double>,3> > h_rawData(rawData); // h_rawData.read(H5FileID, psirName.c_str()); // if ((rawData.size(0) != nx) || // (rawData.size(1) != ny) || // (rawData.size(2) != nz)) { // fprintf (stderr, "Error in EinsplineSetBuilder::ReadBands.\n"); // fprintf (stderr, "Extended 
orbitals should all have the same dimensions\n"); // abort(); // } //#pragma omp parallel for // for (int ix=0; ix<nx; ix++) { // PosType ru; // ru[0] = (RealType)ix / (RealType)nx; // for (int iy=0; iy<ny; iy++) { // ru[1] = (RealType)iy / (RealType)ny; // for (int iz=0; iz<nz; iz++) { // ru[2] = (RealType)iz / (RealType)nz; // double phi = -2.0*M_PI*dot (ru, TwistAngles[ti]); // double s, c; // sincos(phi, &s, &c); // complex<double> phase(c,s); // complex<double> z = phase*rawData(ix,iy,iz); // splineData(ix,iy,iz) = z.real(); // } // } // } // } // else { // HDFAttribIO<Array<double,3> > h_splineData(splineData); // h_splineData.read(H5FileID, psirName.c_str()); // if ((splineData.size(0) != nx) || // (splineData.size(1) != ny) || // (splineData.size(2) != nz)) { // fprintf (stderr, "Error in EinsplineSetBuilder::ReadBands.\n"); // fprintf (stderr, "Extended orbitals should all have the same dimensions\n"); // abort(); // } // } // } // myComm->bcast(splineData); // set_multi_UBspline_3d_d // (orbitalSet->FirstOrderSplines[ion][dim], ival, splineData.data()); // } // } // // // orbitalSet->AtomicOrbitals = realOrbs; for (int i=0; i<orbitalSet->AtomicOrbitals.size(); i++) orbitalSet->AtomicOrbitals[i].registerTimers(); //ExtendedMap_d[set] = orbitalSet->MultiSpline; }
/**
 * Fill a complex-valued Einspline orbital set for one spin channel from the
 * ES-HDF file behind H5FileID, timing the individual phases.
 *
 * NOTE(review): in this copy of the file the statements of this function sit
 * on a few very long physical lines, so every "//" comment also hides the
 * statements that follow it on the same line. The description below follows
 * the statements as they are written; the original line breaks have to be
 * restored for the code to behave that way.
 *
 * Parameters:
 *   spin        selects FullBands[spin] and the "/spin_<spin>" part of every
 *               HDF path that is built here.
 *   orbitalSet  destination; its per-orbital vectors, MultiSpline, MuffinTins
 *               and AtomicOrbitals are written.
 *
 * Steps, in the order they are written:
 *   1. Broadcast NumDistinctOrbitals, NumValenceOrbs and NumCoreOrbs, then
 *      resize the per-orbital vectors of orbitalSet to N = NumDistinctOrbitals.
 *   2. On rank 0 only: compute kPoints and MakeTwoCopies for every band; both
 *      are then broadcast.
 *   3. havePsig = ReadGvectors_ESHDF(). The spline grids span [0,1] with
 *      MeshSize points per axis and every boundary is PERIODIC; MultiSpline
 *      is created with NumValenceOrbs splines.
 *   4. Size and initialise the muffin tins and allocate the builder's
 *      AtomicOrbitals.
 *   5. isComplex starts at 1, is read from "/electrons/psi_r_is_complex" on
 *      rank 0 and broadcast; the run aborts when it is 0. RotateBands_ESHDF
 *      is then called, and the run also aborts if bcastSortBands reports
 *      core states.
 *   6. If havePsig: per band, rank 0 reads psi_g into cG, the coefficients
 *      are broadcast, unpacked into FFTbox, transformed in place with an
 *      FFTW plan of sign +1, passed through fix_phase_rotate_c2c and stored
 *      with set_multi_UBspline_3d_z. Otherwise rank 0 reads psi_r straight
 *      into splineData, which is broadcast and stored the same way.
 *   7. Log the accumulated timings.
 *   8. Per band: for each atomic orbital, rank 0 reads radial_spline_<iat>
 *      and poly_coefs_<iat>, both are broadcast and handed to set_band; then
 *      for each muffin tin, rank 0 reads u_lm_r and du_lm_dr, which are
 *      broadcast together with k and handed to set_APW.
 *   9. Copy AtomicOrbitals into orbitalSet->AtomicOrbitals and register
 *      timers.
 *
 * Every rank has to execute the same sequence of myComm->bcast calls; only
 * the HDF reads are guarded by "if (root)".
 *
 * NOTE(review): the function-scope bi and ti appear unused; every later use
 * declares its own ti or bi in an inner scope.
 * NOTE(review): inside the atomic-orbital loop, orb is declared a second time
 * within the root block, and in the muffin-tin loop "PosType twist, k" hides
 * the per-band twist declared at the top of the loop body.
 * NOTE(review): set_APW is called through OrbitalSet (capital O), whereas
 * init_APW used the orbitalSet parameter. Presumably OrbitalSet is a builder
 * member that points at the same object - confirm.
 * NOTE(review): RotateBands_ESHDF is called here outside any root guard,
 * while the double overload calls it inside "if (root)" - confirm which is
 * intended.
 * NOTE(review): the value logged as "READBANDS::SUM" is t_init, which only
 * accumulates time spent in the havePsig branch.
 */
void EinsplineSetBuilder::ReadBands_ESHDF(int spin, EinsplineSetExtended<complex<double > >* orbitalSet) { update_token(__FILE__,__LINE__,"ReadBands_ESHDF:complex"); ReportEngine PRE("EinsplineSetBuilder","ReadBands_ESHDF(EinsplineSetExtended<complex<double > >*"); Timer c_prep, c_unpack,c_fft, c_phase, c_spline, c_newphase, c_h5, c_init; double t_prep=0.0, t_unpack=0.0, t_fft=0.0, t_phase=0.0, t_spline=0.0, t_newphase=0.0, t_h5=0.0, t_init=0.0; c_prep.restart(); bool root = myComm->rank()==0; vector<BandInfo>& SortBands(*FullBands[spin]); // bcast other stuff myComm->bcast (NumDistinctOrbitals); myComm->bcast (NumValenceOrbs); myComm->bcast (NumCoreOrbs); int N = NumDistinctOrbitals; orbitalSet->kPoints.resize(N); orbitalSet->MakeTwoCopies.resize(N); orbitalSet->StorageValueVector.resize(N); orbitalSet->BlendValueVector.resize(N); orbitalSet->StorageLaplVector.resize(N); orbitalSet->BlendLaplVector.resize(N); orbitalSet->StorageGradVector.resize(N); orbitalSet->BlendGradVector.resize(N); orbitalSet->StorageHessVector.resize(N); orbitalSet->StorageGradHessVector.resize(N); orbitalSet->phase.resize(N); orbitalSet->eikr.resize(N); orbitalSet->NumValenceOrbs = NumValenceOrbs; orbitalSet->NumCoreOrbs = NumCoreOrbs; // Read in k-points int numOrbs = orbitalSet->getOrbitalSetSize(); int num = 0; if (root) { for (int iorb=0; iorb<N; iorb++) { int ti = SortBands[iorb].TwistIndex; PosType twist = TwistAngles[ti]; orbitalSet->kPoints[iorb] = orbitalSet->PrimLattice.k_cart(twist); orbitalSet->MakeTwoCopies[iorb] = (num < (numOrbs-1)) && SortBands[iorb].MakeTwoCopies; num += orbitalSet->MakeTwoCopies[iorb] ? 
2 : 1; } } myComm->bcast(orbitalSet->kPoints); myComm->bcast(orbitalSet->MakeTwoCopies); // First, check to see if we have already read this in H5OrbSet set(H5FileName, spin, N); ///check mesh or ready for FFT grid bool havePsig=ReadGvectors_ESHDF(); app_log() << "MeshSize = (" << MeshSize[0] << ", " << MeshSize[1] << ", " << MeshSize[2] << ")\n"; int nx, ny, nz, bi, ti; nx=MeshSize[0]; ny=MeshSize[1]; nz=MeshSize[2]; Ugrid x_grid, y_grid, z_grid; BCtype_z xBC, yBC, zBC; xBC.lCode = PERIODIC; xBC.rCode = PERIODIC; yBC.lCode = PERIODIC; yBC.rCode = PERIODIC; zBC.lCode = PERIODIC; zBC.rCode = PERIODIC; x_grid.start = 0.0; x_grid.end = 1.0; x_grid.num = nx; y_grid.start = 0.0; y_grid.end = 1.0; y_grid.num = ny; z_grid.start = 0.0; z_grid.end = 1.0; z_grid.num = nz; // Create the multiUBspline object orbitalSet->MultiSpline = create_multi_UBspline_3d_z (x_grid, y_grid, z_grid, xBC, yBC, zBC, NumValenceOrbs); ////////////////////////////////////// // Create the MuffinTin APW splines // ////////////////////////////////////// orbitalSet->MuffinTins.resize(NumMuffinTins); for (int tin=0; tin<NumMuffinTins; tin++) { orbitalSet->MuffinTins[tin].Atom = tin; orbitalSet->MuffinTins[tin].set_center (MT_centers[tin]); orbitalSet->MuffinTins[tin].set_lattice(Lattice); orbitalSet->MuffinTins[tin].init_APW (MT_APW_rgrids[tin], MT_APW_lmax[tin], NumValenceOrbs); } for (int iat=0; iat<AtomicOrbitals.size(); iat++) { AtomicOrbitals[iat].set_num_bands(NumValenceOrbs); AtomicOrbitals[iat].allocate(); } int isComplex=1; if (root) { HDFAttribIO<int> h_isComplex(isComplex); h_isComplex.read(H5FileID, "/electrons/psi_r_is_complex"); } myComm->bcast(isComplex); if (!isComplex) { APP_ABORT("Expected complex orbitals in ES-HDF file, but found real ones."); } EinsplineSetBuilder::RotateBands_ESHDF(spin, orbitalSet); bool isCore = bcastSortBands(spin,N,root); if(isCore) { APP_ABORT("Core states not supported by ES-HDF yet."); } t_prep += c_prep.elapsed(); /** For valence orbitals, * - extended 
orbitals either in G or in R * - localized orbitals */ //this can potentially break Array<ComplexType,3> splineData(nx,ny,nz); if(havePsig)//perform FFT using FFTW { c_init.restart(); Array<ComplexType,3> FFTbox; FFTbox.resize(MeshSize[0], MeshSize[1], MeshSize[2]); fftw_plan FFTplan = fftw_plan_dft_3d (MeshSize[0], MeshSize[1], MeshSize[2], reinterpret_cast<fftw_complex*>(FFTbox.data()), reinterpret_cast<fftw_complex*>(FFTbox.data()), +1, FFTW_ESTIMATE); Vector<complex<double> > cG(MaxNumGvecs); //this will be parallelized with OpenMP for(int iorb=0,ival=0; iorb<N; ++iorb, ++ival) { //Vector<complex<double> > cG; int ncg=0; int ti=SortBands[iorb].TwistIndex; c_h5.restart(); if(root) { ostringstream path; path << "/electrons/kpoint_" << SortBands[iorb].TwistIndex << "/spin_" << spin << "/state_" << SortBands[iorb].BandIndex << "/psi_g"; HDFAttribIO<Vector<complex<double> > > h_cG(cG); h_cG.read (H5FileID, path.str().c_str()); ncg=cG.size(); } myComm->bcast(ncg); if(ncg != Gvecs[0].size()) { APP_ABORT("Failed : ncg != Gvecs[0].size()"); } if(!root) cG.resize(ncg); myComm->bcast(cG); t_h5 += c_h5.elapsed(); c_unpack.restart(); unpack4fftw(cG,Gvecs[0],MeshSize,FFTbox); t_unpack+= c_unpack.elapsed(); c_fft.restart(); fftw_execute (FFTplan); t_fft+= c_fft.elapsed(); c_phase.restart(); fix_phase_rotate_c2c(FFTbox,splineData,TwistAngles[ti]); t_phase+= c_phase.elapsed(); c_spline.restart(); set_multi_UBspline_3d_z(orbitalSet->MultiSpline, ival, splineData.data()); t_spline+= c_spline.elapsed(); } fftw_destroy_plan(FFTplan); t_init+=c_init.elapsed(); } else { //this will be parallelized with OpenMP for(int iorb=0,ival=0; iorb<N; ++iorb, ++ival) { //check dimension if(root) { ostringstream path; path << "/electrons/kpoint_" << SortBands[iorb].TwistIndex << "/spin_" << spin << "/state_" << SortBands[iorb].BandIndex << "/psi_r"; HDFAttribIO<Array<complex<double>,3> > h_splineData(splineData); h_splineData.read(H5FileID, path.str().c_str()); } myComm->bcast(splineData); 
set_multi_UBspline_3d_z(orbitalSet->MultiSpline, ival, splineData.data()); } //return true; } app_log() << " READBANDS::PREP = " << t_prep << endl; app_log() << " READBANDS::H5 = " << t_h5 << endl; app_log() << " READBANDS::UNPACK = " << t_unpack << endl; app_log() << " READBANDS::FFT = " << t_fft << endl; app_log() << " READBANDS::PHASE = " << t_phase << endl; app_log() << " READBANDS::SPLINE = " << t_spline << endl; app_log() << " READBANDS::SUM = " << t_init << endl; //now localized orbitals for(int iorb=0,ival=0; iorb<N; ++iorb, ++ival) { PosType twist=TwistAngles[SortBands[iorb].TwistIndex]; // Read atomic orbital information for (int iat=0; iat<AtomicOrbitals.size(); iat++) { app_log() << "Reading orbital " << iat << " for band " << ival << endl; AtomicOrbital<complex<double> > &orb = AtomicOrbitals[iat]; Array<complex<double>,2> radial_spline(orb.SplinePoints,orb.Numlm), poly_coefs(orb.PolyOrder+1,orb.Numlm); if (root) { int ti = SortBands[iorb].TwistIndex; int bi = SortBands[iorb].BandIndex; ostringstream path; path << "/electrons/kpoint_" << ti << "/spin_" << spin << "/state_" << bi << "/"; AtomicOrbital<complex<double> > &orb = AtomicOrbitals[iat]; ostringstream spline_path, poly_path; spline_path << path.str() << "radial_spline_" << iat; poly_path << path.str() << "poly_coefs_" << iat; HDFAttribIO<Array<complex<double>,2> > h_radial_spline(radial_spline); HDFAttribIO<Array<complex<double>,2> > h_poly_coefs(poly_coefs); h_radial_spline.read(H5FileID, spline_path.str().c_str()); h_poly_coefs.read (H5FileID, poly_path.str().c_str()); // cerr << "radial_spline.size = (" << radial_spline.size(0) // << ", " << radial_spline.size(1) << ")\n"; // cerr << "poly_coefs.size = (" << poly_coefs.size(0) // << ", " << poly_coefs.size(1) << ")\n"; } myComm->bcast(radial_spline); myComm->bcast(poly_coefs); AtomicOrbitals[iat].set_band (ival, radial_spline, poly_coefs, twist); } // Now read muffin tin data for (int tin=0; tin<NumMuffinTins; tin++) { // app_log() << 
"Reading data for muffin tin " << tin << endl; PosType twist, k; int lmax = MT_APW_lmax[tin]; int numYlm = (lmax+1)*(lmax+1); Array<complex<double>,2> u_lm_r(numYlm, MT_APW_num_radial_points[tin]); Array<complex<double>,1> du_lm_dr (numYlm); if (root) { int ti = SortBands[iorb].TwistIndex; int bi = SortBands[iorb].BandIndex; twist = TwistAngles[ti]; k = orbitalSet->PrimLattice.k_cart(twist); string uName = MuffinTinPath (ti, bi,tin) + "u_lm_r"; string duName = MuffinTinPath (ti, bi,tin) + "du_lm_dr"; HDFAttribIO<Array<complex<double>,2> > h_u_lm_r(u_lm_r); HDFAttribIO<Array<complex<double>,1> > h_du_lm_dr(du_lm_dr); h_u_lm_r.read(H5FileID, uName.c_str()); h_du_lm_dr.read(H5FileID, duName.c_str()); } myComm->bcast(u_lm_r); myComm->bcast(du_lm_dr); myComm->bcast(k); double Z = (double)IonTypes(tin); OrbitalSet->MuffinTins[tin].set_APW (ival, k, u_lm_r, du_lm_dr, Z); } } orbitalSet->AtomicOrbitals = AtomicOrbitals; for (int i=0; i<orbitalSet->AtomicOrbitals.size(); i++) orbitalSet->AtomicOrbitals[i].registerTimers(); //ExtendedMap_z[set] = orbitalSet->MultiSpline; }