Exemplo n.º 1
0
/*
 * Allocate and initialise an Athena 3D FFT plan.
 *
 *   pD            domain; only its Comm_Domain is read, and only when
 *                 FFT_BLOCK_DECOMP is defined
 *   gnx3,gnx2,gnx1  global grid extents
 *   gks..gie      inclusive index bounds of the local block in each direction
 *   data          work array used for planning, or NULL to have one allocated
 *   al            only consulted when data == NULL: if zero, the array
 *                 allocated here is treated as scratch and freed before return
 *   dir           ATH_FFT_FORWARD or ATH_FFT_BACKWARD
 *
 * Returns the newly malloc'ed plan.  Failures are reported through
 * ath_error().
 *
 * NOTE(review): when data == NULL and al != 0 the array allocated here is
 * neither freed nor handed back to the caller -- confirm this is intended.
 */
struct ath_3d_fft_plan *ath_3d_fft_create_plan(DomainS *pD, int gnx3, int gnx2,
				int gnx1, int gks, int gke, int gjs, int gje,
				int gis, int gie, ath_fft_data *data, int al,
				ath_fft_direction dir)
{
  int nbuf;
  int free_scratch;
  struct ath_3d_fft_plan *ath_plan;

  if (!(dir == ATH_FFT_FORWARD || dir == ATH_FFT_BACKWARD)) {
    ath_error("Invalid Athena FFT direction.\n");
  }

  ath_plan = (struct ath_3d_fft_plan *)malloc(sizeof(struct ath_3d_fft_plan));
  if (ath_plan == NULL) {
    ath_error("[ath_3d_fft_plan] Couldn't malloc for FFT plan.\n");
  }

  /* Direction and element counts (local block and global grid). */
  ath_plan->dir  = dir;
  ath_plan->cnt  = (gke-gks+1)*(gje-gjs+1)*(gie-gis+1);
  ath_plan->gcnt = gnx3*gnx2*gnx1;

  /* The array is scratch only if we allocate it ourselves and al is zero. */
  free_scratch = (data == NULL && al == 0) ? 1 : 0;

  if (data == NULL) {
    data = (ath_fft_data *)ath_3d_fft_malloc(ath_plan);
    if (data == NULL)
      ath_error("[ath_3d_fft_plan] Couln't malloc for FFT plan data.\n");
  }

#ifdef FFT_BLOCK_DECOMP
  /* The block-decomposition library builds one plan for both directions. */
  ath_plan->plan = fft_3d_create_plan(pD->Comm_Domain, gnx3, gnx2, gnx1,
					gks, gke, gjs, gje, gis, gie,
					gks, gke, gjs, gje, gis, gie,
					0, 0, &nbuf);
#else /* FFT_BLOCK_DECOMP */
  /* In-place FFTW plan; the sign follows the requested direction. */
  ath_plan->plan = fftw_plan_dft_3d(gnx1, gnx2, gnx3, data, data,
			(dir == ATH_FFT_FORWARD) ? FFTW_FORWARD : FFTW_BACKWARD,
			FFTW_MEASURE);
#endif /* FFT_BLOCK_DECOMP */

  if (free_scratch) ath_3d_fft_free(data);

  return ath_plan;
}
Exemplo n.º 2
0
Arquivo: 3d.c Projeto: 3ki5tj/scinotes
/*
 * Cross-check of FFTW's 3D forward transform against ft3d():
 * fills an N0 x N1 x N2 complex array with a few trigonometric modes,
 * transforms it with both routines and prints every output element for
 * which either result has a real or imaginary part above 1e-3 in magnitude.
 */
int main(void)
{
  fftw_complex in[N0][N1][N2], out[N0][N1][N2], out2[N0][N1][N2]; /* double [2] */
  const double tol = 1e-3;
  fftw_plan plan;
  int a, b, c;

  plan = fftw_plan_dft_3d(N0, N1, N2, &in[0][0][0], &out[0][0][0],
                          FFTW_FORWARD, FFTW_ESTIMATE);

  /* Input signal: index [0] is the real part, index [1] the imaginary part. */
  for (a = 0; a < N0; a++) {
    for (b = 0; b < N1; b++) {
      for (c = 0; c < N2; c++) {
        in[a][b][c][0] = sin(2*M_PI*a/N0) + 3*cos(6*M_PI*b/N1);
        in[a][b][c][1] = 5*cos(4*M_PI*c/N2);
      }
    }
  }

  fftw_execute(plan);
  ft3d(N0, N1, N2, &in[0][0][0], &out2[0][0][0]);

  /* Show only the non-negligible coefficients, both results side by side. */
  for (a = 0; a < N0; a++) {
    for (b = 0; b < N1; b++) {
      for (c = 0; c < N2; c++) {
        int significant = fabs(out[a][b][c][0]) > tol
                       || fabs(out[a][b][c][1]) > tol
                       || fabs(out2[a][b][c][0]) > tol
                       || fabs(out2[a][b][c][1]) > tol;
        if (significant) {
          printf("%6d %6d %6d: %20.10f %20.10f |  %20.10f %20.10f\n",
                 a, b, c, out[a][b][c][0], out[a][b][c][1],
                 out2[a][b][c][0], out2[a][b][c][1]);
        }
      }
    }
  }

  fftw_destroy_plan(plan);
  fftw_cleanup();
  return 0;
}
Exemplo n.º 3
0
/*
 * MatApply_USFFT_Private - placeholder for applying a USFFT matrix to a vector.
 *
 * As written, everything between PetscFunctionBegin and PetscFunctionReturn is
 * compiled out with "#if 0", so the function currently does nothing and
 * returns 0; none of A, plan, direction, x or y is read.
 *
 * The disabled code sketches the intended behaviour: resample x, then either
 * create an FFTW plan (1D/2D/3D/general rank, or a "many" plan when dof > 1),
 * store it in *plan and execute it, or re-execute an existing plan on the
 * arrays.
 *
 * NOTE(review): the disabled code does not compile as it stands -- the
 * declaration under the first "#if 0" has no variable name, and the body uses
 * "usfft" and "x_array", neither of which is declared here.  It also
 * restores x with x_array although the array was obtained from
 * usfft->resample into r_array.  Fix these before re-enabling.
 */
PetscErrorCode MatApply_USFFT_Private(Mat A, fftw_plan *plan, int direction, Vec x,Vec y)
{
#if 0
  /* Disabled declarations (see NOTE above: the third one lacks an identifier). */
  PetscErrorCode ierr;
  PetscScalar    *r_array, *y_array;
  Mat_USFFT* = (Mat_USFFT*)(A->data);
#endif

  PetscFunctionBegin;
#if 0
  /* resample x to usfft->resample */
  ierr = MatResample_USFFT_Private(A, x);CHKERRQ(ierr);

  /* NB: for now we use outdim for both x and y; this will change once a full USFFT is implemented */
  ierr = VecGetArray(usfft->resample,&r_array);CHKERRQ(ierr);
  ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
  if (!*plan) { /* create a plan then execute it*/
    if (usfft->dof == 1) {
#if defined(PETSC_DEBUG_USFFT)
      ierr = PetscPrintf(PetscObjectComm((PetscObject)A), "direction = %d, usfft->ndim = %d\n", direction, usfft->ndim);CHKERRQ(ierr);
      for (int ii = 0; ii < usfft->ndim; ++ii) {
        ierr = PetscPrintf(PetscObjectComm((PetscObject)A), "usfft->outdim[%d] = %d\n", ii, usfft->outdim[ii]);CHKERRQ(ierr);
      }
#endif

      /* NOTE(review): switches on usfft->dim while the default case and the
         debug print use usfft->ndim -- confirm which field is intended. */
      switch (usfft->dim) {
      case 1:
        *plan = fftw_plan_dft_1d(usfft->outdim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,direction,usfft->p_flag);
        break;
      case 2:
        *plan = fftw_plan_dft_2d(usfft->outdim[0],usfft->outdim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,direction,usfft->p_flag);
        break;
      case 3:
        *plan = fftw_plan_dft_3d(usfft->outdim[0],usfft->outdim[1],usfft->outdim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,direction,usfft->p_flag);
        break;
      default:
        *plan = fftw_plan_dft(usfft->ndim,usfft->outdim,(fftw_complex*)x_array,(fftw_complex*)y_array,direction,usfft->p_flag);
        break;
      }
      fftw_execute(*plan);
    } /* if (dof == 1) */
    else { /* if (dof > 1) */
      /* dof interleaved transforms: element stride is dof, consecutive transforms start 1 apart */
      *plan = fftw_plan_many_dft(/*rank*/usfft->ndim, /*n*/usfft->outdim, /*howmany*/usfft->dof,
                                 (fftw_complex*)x_array, /*nembed*/usfft->outdim, /*stride*/usfft->dof, /*dist*/1,
                                 (fftw_complex*)y_array, /*nembed*/usfft->outdim, /*stride*/usfft->dof, /*dist*/1,
                                 /*sign*/direction, /*flags*/usfft->p_flag);
      fftw_execute(*plan);
    } /* if (dof > 1) */
  } /* if (!*plan) */
  else {  /* if (*plan) */
    /* use existing plan */
    fftw_execute_dft(*plan,(fftw_complex*)x_array,(fftw_complex*)y_array);
  }
  ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
  ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
#endif
  /* With the body disabled this is the only effective statement. */
  PetscFunctionReturn(0);
} /* MatApply_USFFT_Private() */
Exemplo n.º 4
0
Arquivo: MPC.cpp Projeto: jyamu/qmc
// Tabulates the real-space function
//     f(r) = -r^2/(2 L^3) + 3/(2 L)   for r <  L
//     f(r) = 1/r                      for r >= L
// (L is the lattice's WignerSeitzRadius, r the distance returned by the
// boundary-condition handler) on an N x N x N grid over the unit cell,
// Fourier transforms it with FFTW and samples the result.
//
//   g_0  [out]  norm * Re(FFT at index (0,0,0)), with norm = 1/N^3
//   g_G  [out]  for each entry of Gints, norm * Re(FFT at that integer
//               G-vector folded into [0, N)); must already hold at least
//               Gints.size() elements
//   N    [in]   grid points per dimension (must be > 0)
void
MPC::compute_g_G(double &g_0, vector<double> &g_G, int N)
{
  double L = PtclRef->Lattice.WignerSeitzRadius;
  double Linv = 1.0/L;
  double Linv3 = Linv*Linv*Linv;
  // Real-space input and reciprocal-space output of the FFT
  Array<complex<double>,3> rBox(N,N,N);
  Array<complex<double>,3> GBox(N,N,N);
  // Boundary-condition handler used to obtain the (squared) distance
  DTD_BConds<double,3,SUPERCELL_BULK> mybc(PtclRef->Lattice);
  // Fill the real-space array with f(r)
  double Ninv = 1.0/(double)N;
  TinyVector<double,3> u, r;
  for (int ix=0; ix<N; ix++)
  {
    u[0] = Ninv*ix;
    for (int iy=0; iy<N; iy++)
    {
      u[1] = Ninv*iy;
      for (int iz=0; iz<N; iz++)
      {
        u[2] = Ninv*iz;
        r = PtclRef->Lattice.toCart (u);
        // apply_bc's result is square-rooted, so it is taken to be |r|^2
        double rmag = std::sqrt(mybc.apply_bc(r));
        if (rmag < L)
          rBox(ix,iy,iz) = -0.5*rmag*rmag*Linv3 + 1.5*Linv;
        else
          rBox(ix,iy,iz) = 1.0/rmag;
      }
    }
  }
  // FFTW_BACKWARD is the +1 exponent sign the original passed as a literal.
  fftw_plan fft = fftw_plan_dft_3d
                  (N, N, N, (fftw_complex*)rBox.data(),
                   (fftw_complex*) GBox.data(), FFTW_BACKWARD, FFTW_ESTIMATE);
  fftw_execute (fft);
  fftw_destroy_plan (fft);
  // Now, copy data into output
  double norm = Ninv*Ninv*Ninv;
  int numG = Gints.size();
  for (int iG=0; iG < numG; iG++)
  {
    TinyVector<int,OHMMS_DIM> gint = Gints[iG];
    // Fold every component into [0, N).  A plain (g + N) % N stays negative
    // when g < -N, which would index GBox out of bounds; reduce first, then
    // shift.  For g >= -N the result is unchanged.
    for (int j=0; j<OHMMS_DIM; j++)
      gint[j] = ((gint[j] % N) + N) % N;
    g_G[iG] = norm * real(GBox(gint[0], gint[1], gint[2]));
  }
  g_0 = norm * real(GBox(0,0,0));
}
Exemplo n.º 5
0
// Forward 3D complex-to-complex DFT of d_data, computed in place with FFTW.
// d_data is reinterpreted as an array of fftw_complex; the plan is created
// with dimensions (nz, ny, nx), i.e. nx is the fastest-varying index.
void fft3dCPU(T1* d_data, int nx, int ny, int nz)
{
	cout << "Running forward xform 3d" << endl;

	fftw_complex* buffer = (fftw_complex*) d_data;

	// Input and output share the same buffer, so the transform is in place.
	fftw_plan forward = fftw_plan_dft_3d(nz, ny, nx, buffer, buffer,
			FFTW_FORWARD, FFTW_ESTIMATE);

	fftw_execute(forward);
	fftw_destroy_plan(forward);
}
Exemplo n.º 6
0
// Builds forward and backward FFTW plans for a 3D convolution.
//
//   width/height/depth  volume extents
//   kw                  kernel width
//   mode                0: plan for the volume as given
//                       1: plan for the volume grown by kw - 1 on every axis
//                       anything else: the mode is printed and
//                       std::invalid_argument is thrown
//   threadMaxCount      if > 1, FFTW threading is initialised and subsequent
//                       plans are created for that many threads
//
// The plans are measured on two temporary buffers which are released before
// the constructor returns; staticKernel starts out as NULL.
convolution_plan::convolution_plan(int width, int height, int depth, int kw, int mode, int threadMaxCount) {

    if (mode != 0 && mode != 1) {
        std::cout << mode << std::endl;
        throw std::invalid_argument("Warning: 3d convolution plan: Invalid mode");
    }

    const bool padded = (mode == 1);
    this->width  = padded ? width + kw - 1  : width;
    this->height = padded ? height + kw - 1 : height;
    this->depth  = padded ? depth + kw - 1  : depth;

    if (threadMaxCount > 1) {
        fftw_init_threads(); // This MUST come before all other fftw calls
        fftw_plan_with_nthreads(threadMaxCount);
    }

    this->dim = 3;
    this->kw = kw;
    this->threadMaxCount = threadMaxCount;

    // Scratch buffers only needed while FFTW_MEASURE plans are being built.
    const size_t bytes = sizeof(fftw_complex) * this->width * this->height * this->depth;
    fftw_complex* scratchIn  = (fftw_complex*) fftw_malloc(bytes);
    fftw_complex* scratchOut = (fftw_complex*) fftw_malloc(bytes);

    this->forwardPlan = fftw_plan_dft_3d(this->depth, this->height, this->width,
                                         scratchIn, scratchOut, FFTW_FORWARD, FFTW_MEASURE);
    this->backwardPlan = fftw_plan_dft_3d(this->depth, this->height, this->width,
                                          scratchIn, scratchOut, FFTW_BACKWARD, FFTW_MEASURE);

    fftw_free(scratchIn);
    fftw_free(scratchOut);

    this->staticKernel = NULL;
}
Exemplo n.º 7
0
/*
 * Normalised forward 3D DFT.
 *
 *   out    receives the transform of `in`, divided by n[0]*n[1]*n[2]
 *   k      receives, per axis, 2*pi / (n[axis] * delta[axis])
 *   in     input array of n[0] x n[1] x n[2] complex values
 *   n      the three grid extents
 *   delta  the three per-axis values used in the k formula above
 *          (presumably grid spacings -- confirm with callers)
 *
 * NOTE(review): `out[i] /= ...` requires fftw_complex to be an arithmetic
 * complex type (i.e. <complex.h> included before <fftw3.h>) -- confirm.
 */
void fft3d(fftw_complex * out, double * k, fftw_complex * in, int * n, double * delta)
{
  fftw_plan plan = fftw_plan_dft_3d(n[0], n[1], n[2], in, out, FFTW_FORWARD, FFTW_ESTIMATE);

  for (int axis = 0; axis < 3; axis++)
  {
    k[axis] = 2 * M_PI / (n[axis] * delta[axis]);
  }

  fftw_execute(plan);
  fftw_destroy_plan(plan);

  /* FFTW leaves the transform unnormalised; scale by the element count. */
  const int total = n[0]*n[1]*n[2];
  for (int idx = 0; idx < total; idx++)
  {
    out[idx] /= total;
  }
}
Exemplo n.º 8
0
/**
 * FFT-based 3D "varmap" (presumably a variogram map -- confirm with callers).
 *
 * Outline of what the code does:
 *   1. Zero-pads the values, their squares and the has_point indicator into
 *      (2N+1) x (2M+1) x (2K+1) complex arrays.
 *   2. Forward-transforms the three arrays.
 *   3. Combines the spectra through mul_conj() (defined elsewhere; presumably
 *      a * conj(b)) and transforms the two combined spectra back.
 *   4. For every lag (hx, hy, hz) in N x M x K stores the pair count in
 *      Yout.ni and, where the count exceeds 0.01, Y / (2 * count) in
 *      Yout.varmap (0 otherwise).
 *
 * The _p / _p3 indexing macros are defined elsewhere in the file.
 *
 * @param Yout       output; its varmap and ni members are written
 * @param data       sample values, read through _p(i, j, k)
 * @param has_point  indicator, non-zero where a sample exists
 * @param N,M,K      grid extents
 * @return always true
 *
 * Every FFTW plan is destroyed before the next one is created; previously
 * `p` was overwritten and only the last of the five plans was released.
 * Plan creation/destruction stays inside fft_lock()/fft_unlock() as before.
 */
WGSLIB_DECL bool fft_varmap_3d(
	VarOut& Yout,
	const std::vector<double>& data,
	const std::vector<int>& has_point,
	int N, int M, int K)
{

	omp_set_num_threads(fft_get_num_threads());
	fftw_plan_with_nthreads(fft_get_num_threads());

	fftw_complex *z,  *Z, *ZI;
	fftw_complex *z2, *Z2;
	fftw_complex *ni, *NI, *INI;

	fftw_plan p;

	// Number of elements of each zero-padded array
	int ZS = (2 * N + 1) * (2 * M + 1) * (2 * K + 1);

	fft_lock();
	z = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	Z = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	ZI = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);

	z2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	Z2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);

	ni = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	NI = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	INI = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	fft_unlock();

	// Clear the inputs (this also provides the zero padding)
	#pragma omp parallel for
	for (int i = 0; i < ZS; ++i) {
		z[i][REAL] = z[i][IMAG] = 0;
		z2[i][REAL] = z2[i][IMAG] = 0;
		ni[i][REAL] = ni[i][IMAG] = 0;
	}

	// Scatter the samples; cells without a point keep value 0
	#pragma omp parallel for
	for (int i = 0; i < N; ++i) {
		for (int j = 0; j < M; ++j) {
			for (int k = 0; k < K; ++k) {
				double v = data _p(i, j, k);
				int hp = has_point _p(i, j, k);

				ni _p3(i, j, k)[REAL] = hp;

				if (hp) {
					z _p3(i, j, k)[REAL] = v;

					z2 _p3(i, j, k)[REAL] = v * v;
				}
			}
		}
	}

	// Forward transforms: z -> Z, z2 -> Z2, ni -> NI
	fft_lock();
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, z, Z, FFTW_FORWARD, FFTW_ESTIMATE);
	fft_unlock();

	fftw_execute(p);

	fft_lock();
	fftw_destroy_plan(p);
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, z2, Z2, FFTW_FORWARD, FFTW_ESTIMATE);
	fft_unlock();

	fftw_execute(p);

	fft_lock();
	fftw_destroy_plan(p);
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, ni, NI, FFTW_FORWARD, FFTW_ESTIMATE);
	fft_unlock();

	fftw_execute(p);

	// Combine the spectra; Z and NI are overwritten with the results
	#pragma omp parallel for
	for (int h = 0; h < ZS; ++h) {
		fftw_complex Z2_I;
		fftw_complex Z_Z;
		fftw_complex I_Z2;
		fftw_complex I_I;

		mul_conj(Z2_I, Z2[h], NI[h]);
		mul_conj(Z_Z, Z[h], Z[h]);
		mul_conj(I_Z2, NI[h], Z2[h]);
		mul_conj(I_I, NI[h], NI[h]);

		Z[h][REAL] = Z2_I[REAL] - 2 * Z_Z[REAL] + I_Z2[REAL];
		Z[h][IMAG] = Z2_I[IMAG] - 2 * Z_Z[IMAG] + I_Z2[IMAG];

		NI[h][REAL] = I_I[REAL];
		NI[h][IMAG] = I_I[IMAG];
	}

	// Backward transforms: Z -> ZI, NI -> INI
	fft_lock();
	fftw_destroy_plan(p);
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, Z, ZI, FFTW_BACKWARD, FFTW_ESTIMATE);
	fft_unlock();

	fftw_execute(p);

	fft_lock();
	fftw_destroy_plan(p);
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, NI, INI, FFTW_BACKWARD, FFTW_ESTIMATE);
	fft_unlock();

	fftw_execute(p);

	// FFTW transforms are unnormalised, hence the division by ZS.
	// Note: the local `N` below deliberately keeps its original name (it
	// shadows the parameter inside the innermost body only).
	#pragma omp parallel for
	for (int hx = 0; hx < N; ++hx) {
		for (int hy = 0; hy < M; ++hy) {
			for (int hz = 0; hz < K; ++hz) {
				double Y = ZI _p3(hx, hy, hz)[REAL]  / ZS;
				double N = INI _p3(hx, hy, hz)[REAL] / ZS;

				if (N > 0.01) {
					Yout.varmap _p(hx, hy, hz) = Y / (2 * N);
				} else {
					Yout.varmap _p(hx, hy, hz) = 0;
				}

				Yout.ni _p(hx, hy, hz) = N;
			}
		}
	}

	fft_lock();
	fftw_destroy_plan(p);
	fftw_free(z);
	fftw_free(Z);
	fftw_free(ZI);

	fftw_free(z2);
	fftw_free(Z2);

	fftw_free(ni);
	fftw_free(NI);
	fftw_free(INI);
	fft_unlock();

	return true;
}
Exemplo n.º 9
0
/**
 * FFT-based 3D cross "varmap" with weights (presumably declustering weights
 * and a cross-variogram map -- confirm with callers).
 *
 * Outline of what the code does:
 *   1. Zero-pads eight real fields built from the two data sets, their
 *      indicators and the weights of the first set into
 *      (2N+1) x (2M+1) x (2K+1) complex arrays.
 *   2. Forward-transforms all eight arrays.
 *   3. Combines the spectra through mul_conj() (defined elsewhere; presumably
 *      a * conj(b)), overwriting Z1Z2 and W, and transforms those two back
 *      into z1z2 and w.
 *   4. For every lag (hx, hy, hz) in N x M x K stores the normalised weight
 *      sum in Yout.ni and, where it exceeds 0.01, Y / sum in Yout.varmap
 *      (0 otherwise).
 *
 * The _p / _p3 indexing macros are defined elsewhere in the file.
 *
 * @param Yout        output; its varmap and ni members are written
 * @param weigth1     weights of the first data set
 * @param data1       values of the first data set
 * @param has_point1  indicator of the first data set
 * @param data2       values of the second data set
 * @param has_point2  indicator of the second data set
 * @param N,M,K       grid extents
 * @return always true
 *
 * Every FFTW plan is destroyed before the next one is created; previously
 * `p` was overwritten and only the last of the ten plans was released.
 * Plan creation/destruction stays inside fft_lock()/fft_unlock() as before.
 */
WGSLIB_DECL bool fft_crossvarmap_3d_declus(
	VarOut& Yout,

	const std::vector<double>& weigth1,
	const std::vector<double>& data1,
	const std::vector<int>& has_point1,

	const std::vector<double>& data2,
	const std::vector<int>& has_point2,
	int N, int M, int K)
{
	omp_set_num_threads(fft_get_num_threads());
	fftw_plan_with_nthreads(fft_get_num_threads());

	// lower case: spatial-domain arrays, upper case: their transforms
	fftw_complex *i1i2,  *I1I2;
	fftw_complex *z1i2,  *Z1I2;
	fftw_complex *i1z2,  *I1Z2;
	fftw_complex *z1z2,  *Z1Z2;
	fftw_complex *w,     *W;
	fftw_complex *wz1,   *WZ1;
	fftw_complex *wz2,   *WZ2;
	fftw_complex *wz1z2, *WZ1Z2;

	fftw_plan p;

	// Number of elements of each zero-padded array
	int ZS = (2 * N + 1) * (2 * M + 1) * (2 * K + 1);

	fft_lock();
	i1i2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	I1I2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);

	z1i2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	Z1I2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);

	i1z2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	I1Z2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);

	z1z2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	Z1Z2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);

	w = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	W = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);

	wz1 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	WZ1 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);

	wz2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	WZ2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	wz1z2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	WZ1Z2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	fft_unlock();

	// Clear every array (for the inputs this also provides the zero padding)
	#pragma omp parallel for
	for (int i = 0; i < ZS; ++i) {
		i1i2[i][REAL] = i1i2[i][IMAG] = 0;
		I1I2[i][REAL] = I1I2[i][IMAG] = 0;

		z1i2[i][REAL] = z1i2[i][IMAG] = 0;
		Z1I2[i][REAL] = Z1I2[i][IMAG] = 0;

		i1z2[i][REAL] = i1z2[i][IMAG] = 0;
		I1Z2[i][REAL] = I1Z2[i][IMAG] = 0;

		z1z2[i][REAL] = z1z2[i][IMAG] = 0;
		Z1Z2[i][REAL] = Z1Z2[i][IMAG] = 0;

		w[i][REAL] = w[i][IMAG] = 0;
		W[i][REAL] = W[i][IMAG] = 0;

		wz1[i][REAL] = wz1[i][IMAG] = 0;
		WZ1[i][REAL] = WZ1[i][IMAG] = 0;

		wz2[i][REAL] = wz2[i][IMAG] = 0;
		WZ2[i][REAL] = WZ2[i][IMAG] = 0;

		wz1z2[i][REAL] = wz1z2[i][IMAG] = 0;
		WZ1Z2[i][REAL] = WZ1Z2[i][IMAG] = 0;
	}

	// Scatter the eight product fields into the padded arrays
	#pragma omp parallel for
	for (int i = 0; i < N; ++i) {
		for (int j = 0; j < M; ++j) {
			for (int k = 0; k < K; ++k) {
				double v1 = data1 _p(i, j, k);
				double w1_ = weigth1 _p(i, j, k);
				int    i1 = has_point1 _p(i, j, k);

				double v2 = data2 _p(i, j, k);
				int    i2 = has_point2 _p(i, j, k);

				i1i2  _p3(i, j, k)[REAL] = i1 * i2;
				z1i2  _p3(i, j, k)[REAL] = v1 * i2;
				i1z2  _p3(i, j, k)[REAL] = i1 * v2;
				z1z2  _p3(i, j, k)[REAL] = v1 * v2;

				w     _p3(i, j, k)[REAL] = w1_;
				wz1   _p3(i, j, k)[REAL] = w1_ * v1;
				wz2   _p3(i, j, k)[REAL] = w1_ * v2;
				wz1z2 _p3(i, j, k)[REAL] = w1_ * v1 * v2;
			}
		}
	}

	///////////////////////////////////////////////////////////////////
	// Forward transforms of the unweighted fields

	fft_lock();
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, i1i2, I1I2, FFTW_FORWARD, FFTW_ESTIMATE);
	fft_unlock();
	fftw_execute(p);

	fft_lock();
	fftw_destroy_plan(p);
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, z1i2, Z1I2, FFTW_FORWARD, FFTW_ESTIMATE);
	fft_unlock();
	fftw_execute(p);

	fft_lock();
	fftw_destroy_plan(p);
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, i1z2, I1Z2, FFTW_FORWARD, FFTW_ESTIMATE);
	fft_unlock();
	fftw_execute(p);

	fft_lock();
	fftw_destroy_plan(p);
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, z1z2, Z1Z2, FFTW_FORWARD, FFTW_ESTIMATE);
	fft_unlock();
	fftw_execute(p);

	////////////////////////////////////////////////////////////////////////////////////
	// Forward transforms of the weighted fields

	fft_lock();
	fftw_destroy_plan(p);
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, w, W, FFTW_FORWARD, FFTW_ESTIMATE);
	fft_unlock();
	fftw_execute(p);

	fft_lock();
	fftw_destroy_plan(p);
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, wz1, WZ1, FFTW_FORWARD, FFTW_ESTIMATE);
	fft_unlock();
	fftw_execute(p);

	fft_lock();
	fftw_destroy_plan(p);
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, wz2, WZ2, FFTW_FORWARD, FFTW_ESTIMATE);
	fft_unlock();
	fftw_execute(p);

	fft_lock();
	fftw_destroy_plan(p);
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, wz1z2, WZ1Z2, FFTW_FORWARD, FFTW_ESTIMATE);
	fft_unlock();
	fftw_execute(p);

	// Combine the spectra; Z1Z2 and W are overwritten with the results.
	// All products are computed before either array is written.
	#pragma omp parallel for
	for (int h = 0; h < ZS; ++h) {
		fftw_complex W_I1I2;
		fftw_complex I1I2_W;

		fftw_complex WZ1Z2_I1I2;
		fftw_complex I1I2_WZ1Z2;

		fftw_complex WZ2_Z1I2;
		fftw_complex Z1I2_WZ2;


		fftw_complex WZ1_I1Z2;
		fftw_complex I1Z2_WZ1;

		fftw_complex W_Z1Z2;
		fftw_complex Z1Z2_W;


		mul_conj(W_I1I2, W[h], I1I2[h]);
		mul_conj(I1I2_W, I1I2[h], W[h]);


		mul_conj(WZ1Z2_I1I2, WZ1Z2[h], I1I2[h]);
		mul_conj(I1I2_WZ1Z2, I1I2[h], WZ1Z2[h]);

		mul_conj(WZ2_Z1I2, WZ2[h], Z1I2[h]);
		mul_conj(Z1I2_WZ2, Z1I2[h], WZ2[h]);


		mul_conj(WZ1_I1Z2, WZ1[h], I1Z2[h]);
		mul_conj(I1Z2_WZ1, I1Z2[h], WZ1[h]);

		mul_conj(W_Z1Z2, W[h], Z1Z2[h]);
		mul_conj(Z1Z2_W, Z1Z2[h], W[h]);

		Z1Z2[h][REAL] =
			WZ1Z2_I1I2[REAL] -
			WZ2_Z1I2[REAL] -
			WZ1_I1Z2[REAL] +
			W_Z1Z2[REAL] +

			Z1Z2_W[REAL] -
			I1Z2_WZ1[REAL] -
			Z1I2_WZ2[REAL] +
			I1I2_WZ1Z2[REAL];

		Z1Z2[h][IMAG] =
			WZ1Z2_I1I2[IMAG] -
			WZ2_Z1I2[IMAG] -
			WZ1_I1Z2[IMAG] +
			W_Z1Z2[IMAG] +

			Z1Z2_W[IMAG] -
			I1Z2_WZ1[IMAG] -
			Z1I2_WZ2[IMAG] +
			I1I2_WZ1Z2[IMAG];

		W[h][REAL] = 2 * (W_I1I2[REAL] + I1I2_W[REAL]);
		W[h][IMAG] = 2 * (W_I1I2[IMAG] + I1I2_W[IMAG]);
	}

	// Backward transforms: Z1Z2 -> z1z2, W -> w (the input arrays are reused)
	fft_lock();
	fftw_destroy_plan(p);
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, Z1Z2, z1z2, FFTW_BACKWARD, FFTW_ESTIMATE);
	fft_unlock();

	fftw_execute(p);

	fft_lock();
	fftw_destroy_plan(p);
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, W, w, FFTW_BACKWARD, FFTW_ESTIMATE);
	fft_unlock();

	fftw_execute(p);

	// FFTW transforms are unnormalised, hence the division by ZS.
	// Note: the local `N` below deliberately keeps its original name (it
	// shadows the parameter inside the innermost body only).
	#pragma omp parallel for
	for (int hx = 0; hx < N; ++hx) {
		for (int hy = 0; hy < M; ++hy) {
			for (int hz = 0; hz < K; ++hz) {
				double Y = z1z2 _p3(hx, hy, hz)[REAL] / ZS;
				double N = w _p3(hx, hy, hz)[REAL] / ZS;

				if (N > 0.01) {
					Yout.varmap _p(hx, hy, hz) = Y / (N);
				}
				else {
					Yout.varmap _p(hx, hy, hz) = 0;
				}

				Yout.ni _p(hx, hy, hz) = N;
			}
		}
	}

	fft_lock();
	fftw_destroy_plan(p);

	fftw_free(i1i2);
	fftw_free(I1I2);

	fftw_free(z1i2);
	fftw_free(Z1I2);

	fftw_free(i1z2);
	fftw_free(I1Z2);

	fftw_free(z1z2);
	fftw_free(Z1Z2);

	fftw_free(w);
	fftw_free(W);

	fftw_free(wz1);
	fftw_free(WZ1);

	fftw_free(wz2);
	fftw_free(WZ2);

	fftw_free(wz1z2);
	fftw_free(WZ1Z2);
	fft_unlock();

	return true;
}
Exemplo n.º 10
0
/**
 * FFT-based 3D cross "varmap" of two data sets (presumably a cross-variogram
 * map -- confirm with callers).
 *
 * Outline of what the code does:
 *   1. Zero-pads v1*v2, hp1*v2, hp2*v1 and hp1*hp2 into
 *      (2N+1) x (2M+1) x (2K+1) complex arrays.
 *   2. Forward-transforms the four arrays.
 *   3. Combines the spectra through mul_conj() (defined elsewhere; presumably
 *      a * conj(b)), overwriting Z1Z2 and I1I2, and transforms those two
 *      back into Z1I and I1I.
 *   4. For every lag (hx, hy, hz) in N x M x K stores the pair count in
 *      Yout.ni and, where the count exceeds 0.01, Y / (2 * count) in
 *      Yout.varmap (0 otherwise).
 *
 * The _p / _p3 indexing macros are defined elsewhere in the file.
 *
 * @param Yout        output; its varmap and ni members are written
 * @param data1       values of the first data set
 * @param has_point1  indicator of the first data set
 * @param data2       values of the second data set
 * @param has_point2  indicator of the second data set
 * @param N,M,K       grid extents
 * @return always true
 *
 * Every FFTW plan is destroyed before the next one is created; previously
 * `p` was overwritten and only the last of the six plans was released.
 * Plan creation/destruction stays inside fft_lock()/fft_unlock() as before.
 */
WGSLIB_DECL bool fft_crossvarmap_3d(
	VarOut& Yout,
	const std::vector<double>& data1,
	const std::vector<int>& has_point1,
	const std::vector<double>& data2,
	const std::vector<int>& has_point2,
	int N, int M, int K)
{
	omp_set_num_threads(fft_get_num_threads());
	fftw_plan_with_nthreads(fft_get_num_threads());

	// Results of the two backward transforms
	fftw_complex *Z1I;
	fftw_complex *I1I;

	// lower case: spatial-domain arrays, upper case: their transforms
	fftw_complex *z1z2, *Z1Z2;

	fftw_complex *i2z1, *I2Z1;
	fftw_complex *i1z2, *I1Z2;

	fftw_complex *i1i2, *I1I2;

	fftw_plan p;


	// Number of elements of each zero-padded array
	int ZS = (2 * N + 1) * (2 * M + 1) * (2 * K + 1);

	fft_lock();
	Z1I = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	I1I = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);

	z1z2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	Z1Z2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);

	i2z1 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	I2Z1 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);

	i1z2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	I1Z2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);

	i1i2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	I1I2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	fft_unlock();

	// Clear the inputs (this also provides the zero padding)
	#pragma omp parallel for
	for (int i = 0; i < ZS; ++i) {
		z1z2[i][REAL] = z1z2[i][IMAG] = 0;
		i1z2[i][REAL] = i1z2[i][IMAG] = 0;
		i2z1[i][REAL] = i2z1[i][IMAG] = 0;
		i1i2[i][REAL] = i1i2[i][IMAG] = 0;
	}

	// Scatter the four product fields into the padded arrays
	#pragma omp parallel for
	for (int i = 0; i < N; ++i) {
		for (int j = 0; j < M; ++j) {
			for (int k = 0; k < K; ++k) {
				double v1 = data1 _p(i, j, k);
				double v2 = data2 _p(i, j, k);
				int hp1 = has_point1 _p(i, j, k);
				int hp2 = has_point2 _p(i, j, k);

				z1z2 _p3(i, j, k)[REAL] =  v1 * v2;
				i1z2 _p3(i, j, k)[REAL] = hp1 * v2;
				i2z1 _p3(i, j, k)[REAL] = hp2 * v1;
				i1i2 _p3(i, j, k)[REAL] = hp1 * hp2;
			}
		}
	}

	// Forward transforms
	fft_lock();
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, z1z2, Z1Z2, FFTW_FORWARD, FFTW_ESTIMATE);
	fft_unlock();

	fftw_execute(p);

	fft_lock();
	fftw_destroy_plan(p);
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, i1z2, I1Z2, FFTW_FORWARD, FFTW_ESTIMATE);
	fft_unlock();

	fftw_execute(p);

	fft_lock();
	fftw_destroy_plan(p);
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, i2z1, I2Z1, FFTW_FORWARD, FFTW_ESTIMATE);
	fft_unlock();

	fftw_execute(p);

	fft_lock();
	fftw_destroy_plan(p);
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, i1i2, I1I2, FFTW_FORWARD, FFTW_ESTIMATE);
	fft_unlock();

	fftw_execute(p);

	// Combine the spectra; Z1Z2 and I1I2 are overwritten with the results.
	// All products are computed before either array is written.
	#pragma omp parallel for
	for (int h = 0; h < ZS; ++h) {
		fftw_complex A;
		fftw_complex B;

		fftw_complex C;
		fftw_complex D;
		fftw_complex I1_I2;

		mul_conj(A, Z1Z2[h],  I1I2[h]);
		mul_conj(D, I1I2[h],  Z1Z2[h]);

		mul_conj(B, I2Z1[h], I1Z2[h]);
		mul_conj(C, I1Z2[h], I2Z1[h]);

		mul_conj(I1_I2, I1I2[h], I1I2[h]);

		Z1Z2[h][REAL] = A[REAL] - B[REAL] - C[REAL] + D[REAL];
		Z1Z2[h][IMAG] = A[IMAG] - B[IMAG] - C[IMAG] + D[IMAG];

		I1I2[h][REAL] = I1_I2[REAL];
		I1I2[h][IMAG] = I1_I2[IMAG];
	}

	// Backward transforms: Z1Z2 -> Z1I, I1I2 -> I1I
	fft_lock();
	fftw_destroy_plan(p);
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, Z1Z2, Z1I, FFTW_BACKWARD, FFTW_ESTIMATE);
	fft_unlock();

	fftw_execute(p);

	fft_lock();
	fftw_destroy_plan(p);
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, I1I2, I1I, FFTW_BACKWARD, FFTW_ESTIMATE);
	fft_unlock();

	fftw_execute(p);

	// FFTW transforms are unnormalised, hence the division by ZS.
	// Note: the local `N` below deliberately keeps its original name (it
	// shadows the parameter inside the innermost body only).
	#pragma omp parallel for
	for (int hx = 0; hx < N; ++hx) {
		for (int hy = 0; hy < M; ++hy) {
			for (int hz = 0; hz < K; ++hz) {
				double Y = Z1I _p3(hx, hy, hz)[REAL] / ZS;
				double N = I1I _p3(hx, hy, hz)[REAL] / ZS;

				if (N > 0.01) {
					Yout.varmap _p(hx, hy, hz) =  Y / (2 * N);
				} else {
					Yout.varmap _p(hx, hy, hz) = 0;
				}

				Yout.ni _p(hx, hy, hz) = N;

			}
		}
	}

	fft_lock();
	fftw_destroy_plan(p);

	fftw_free(Z1I);
	fftw_free(I1I);

	fftw_free(z1z2);
	fftw_free(Z1Z2);

	fftw_free(i2z1);
	fftw_free(I2Z1);

	fftw_free(i1z2);
	fftw_free(I1Z2);

	fftw_free(i1i2);
	fftw_free(I1I2);
	fft_unlock();

	return true;
}
Exemplo n.º 11
0
/**
 * FFT-based 3D "varmap" with weights (presumably declustering weights and a
 * variogram map -- confirm with callers).
 *
 * Outline of what the code does:
 *   1. Zero-pads v, v^2, the indicator, the weights, v*w and v^2*w into
 *      (2N+1) x (2M+1) x (2K+1) complex arrays (only cells whose has_point
 *      is non-zero receive values).
 *   2. Forward-transforms the six arrays.
 *   3. Combines the spectra through mul_conj() (defined elsewhere; presumably
 *      a * conj(b)), overwriting Z and NI, and transforms those two back
 *      into ZI and INI.
 *   4. For every lag (hx, hy, hz) in N x M x K stores the normalised weight
 *      sum in Yout.ni and, where it exceeds 0.01, Y / sum in Yout.varmap
 *      (0 otherwise).
 *
 * The _p / _p3 indexing macros are defined elsewhere in the file.
 *
 * @param Yout       output; its varmap and ni members are written
 * @param data       sample values
 * @param weigth     per-sample weights
 * @param has_point  indicator, non-zero where a sample exists
 * @param N,M,K      grid extents
 * @return always true
 *
 * Every FFTW plan is destroyed before the next one is created; previously
 * `p` was overwritten and only the last of the eight plans was released.
 * Plan creation/destruction stays inside fft_lock()/fft_unlock() as before.
 */
WGSLIB_DECL bool fft_varmap_3d_declus(
	VarOut& Yout,
	const std::vector<double>& data,
	const std::vector<double>& weigth,
	const std::vector<int>& has_point,
	int N, int M, int K)
{
	omp_set_num_threads(fft_get_num_threads());
	fftw_plan_with_nthreads(fft_get_num_threads());

	fftw_complex *z, *Z, *ZI;
	fftw_complex *z2, *Z2;
	fftw_complex *ni, *NI, *INI;

	fftw_complex *w, *W;

	fftw_complex *z2w, *Z2W;
	fftw_complex *zw, *ZW;

	fftw_plan p;


	// Number of elements of each zero-padded array
	int ZS = (2 * N + 1) * (2 * M + 1) * (2 * K + 1);

	fft_lock();
	z = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	Z = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	ZI = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);

	z2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	Z2 = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);

	ni = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	NI = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	INI = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);

	w = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	W = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);

	z2w = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	Z2W = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);

	zw = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	ZW = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)* ZS);
	fft_unlock();


	// Clear the inputs (this also provides the zero padding)
	#pragma omp parallel for
	for (int i = 0; i < ZS; ++i) {
		z[i][REAL] = z[i][IMAG] = 0;
		z2[i][REAL] = z2[i][IMAG] = 0;
		ni[i][REAL] = ni[i][IMAG] = 0;
		w[i][REAL] = w[i][IMAG] = 0;
		z2w[i][REAL] = z2w[i][IMAG] = 0;
		zw[i][REAL] = zw[i][IMAG] = 0;
	}

	// Scatter the samples; cells without a point keep value 0
	#pragma omp parallel for
	for (int i = 0; i < N; ++i) {
		for (int j = 0; j < M; ++j) {
			for (int k = 0; k < K; ++k) {
				double v = data _p(i, j, k);
				double w_ = weigth _p(i, j, k);
				int hp = has_point _p(i, j, k);

				ni _p3(i, j, k)[REAL] = hp;

				if (hp) {
					z _p3(i, j, k)[REAL] = v;

					z2 _p3(i, j, k)[REAL] = v * v;

					zw _p3(i, j, k)[REAL] = v * w_;

					z2w _p3(i, j, k)[REAL] = v * v * w_;

					w _p3(i, j, k)[REAL] = w_;

				}
			}
		}
	}


	// Take z (and the other inputs) to the frequency domain
	fft_lock();
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, z, Z, FFTW_FORWARD, FFTW_ESTIMATE);
	fft_unlock();

	fftw_execute(p);

	fft_lock();
	fftw_destroy_plan(p);
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, w, W, FFTW_FORWARD, FFTW_ESTIMATE);
	fft_unlock();

	fftw_execute(p);

	fft_lock();
	fftw_destroy_plan(p);
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, zw, ZW, FFTW_FORWARD, FFTW_ESTIMATE);
	fft_unlock();

	fftw_execute(p);

	fft_lock();
	fftw_destroy_plan(p);
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, z2w, Z2W, FFTW_FORWARD, FFTW_ESTIMATE);
	fft_unlock();

	fftw_execute(p);

	fft_lock();
	fftw_destroy_plan(p);
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, z2, Z2, FFTW_FORWARD, FFTW_ESTIMATE);
	fft_unlock();

	fftw_execute(p);

	fft_lock();
	fftw_destroy_plan(p);
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, ni, NI, FFTW_FORWARD, FFTW_ESTIMATE);
	fft_unlock();

	fftw_execute(p);

	// Perform the convolution (product of spectra); Z and NI are overwritten.
	// All products are computed before either array is written.
	#pragma omp parallel for
	for (int h = 0; h < ZS; ++h) {
		fftw_complex Z2W_I;
		fftw_complex ZW_Z;
		fftw_complex W_Z2;
		fftw_complex Z2_W;
		fftw_complex Z_ZW;
		fftw_complex I_Z2W;
		fftw_complex W_I;
		fftw_complex I_W;

		mul_conj(Z2W_I, Z2W[h], NI[h]);
		mul_conj(ZW_Z, ZW[h], Z[h]);
		mul_conj(W_Z2, W[h], Z2[h]);
		mul_conj(Z2_W, Z2[h], W[h]);
		mul_conj(Z_ZW, Z[h], ZW[h]);
		mul_conj(I_Z2W, NI[h], Z2W[h]);

		mul_conj(I_W, NI[h], W[h]);
		mul_conj(W_I, W[h], NI[h]);

		Z[h][REAL] = Z2W_I[REAL] - 2 * ZW_Z[REAL] + W_Z2[REAL] +
					 Z2_W[REAL] - 2 * Z_ZW[REAL] + I_Z2W[REAL];

		Z[h][IMAG] = Z2W_I[IMAG] - 2 * ZW_Z[IMAG] + W_Z2[IMAG] +
					 Z2_W[IMAG] - 2 * Z_ZW[IMAG] + I_Z2W[IMAG];

		NI[h][REAL] = 2 * (W_I[REAL] + I_W[REAL]);
		NI[h][IMAG] = 2 * (W_I[IMAG] + I_W[IMAG]);
	}

	// Backward transforms: Z -> ZI, NI -> INI
	fft_lock();
	fftw_destroy_plan(p);
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, Z, ZI, FFTW_BACKWARD, FFTW_ESTIMATE);
	fft_unlock();

	fftw_execute(p);

	fft_lock();
	fftw_destroy_plan(p);
	p = fftw_plan_dft_3d(2 * N + 1, 2 * M + 1, 2 * K + 1, NI, INI, FFTW_BACKWARD, FFTW_ESTIMATE);
	fft_unlock();

	fftw_execute(p);

	// FFTW transforms are unnormalised, hence the division by ZS.
	// Note: the local `N` below deliberately keeps its original name (it
	// shadows the parameter inside the innermost body only).
	#pragma omp parallel for
	for (int hx = 0; hx < N; ++hx) {
		for (int hy = 0; hy < M; ++hy) {
			for (int hz = 0; hz < K; ++hz) {
				double Y = ZI _p3(hx, hy, hz)[REAL] / ZS;
				double N = INI _p3(hx, hy, hz)[REAL] / ZS;

				if (N > 0.01) {
					Yout.varmap _p(hx, hy, hz) = Y / N;
				}
				else {
					Yout.varmap _p(hx, hy, hz) = 0;
				}

				Yout.ni _p(hx, hy, hz) = N;
			}
		}
	}

	fft_lock();
	fftw_destroy_plan(p);
	fftw_free(z);
	fftw_free(Z);
	fftw_free(ZI);

	fftw_free(z2);
	fftw_free(Z2);

	fftw_free(zw);
	fftw_free(ZW);

	fftw_free(z2w);
	fftw_free(Z2W);

	fftw_free(w);
	fftw_free(W);

	fftw_free(ni);
	fftw_free(NI);
	fftw_free(INI);
	fft_unlock();

	return true;
}
Exemplo n.º 12
0
Arquivo: MPC.cpp Projeto: jyamu/qmc
void
MPC::init_spline()
{
  Array<complex<double>,3> rBox(SplineDim[0], SplineDim[1], SplineDim[2]),
        GBox(SplineDim[0], SplineDim[1], SplineDim[2]);
  Array<double,3> splineData(SplineDim[0], SplineDim[1], SplineDim[2]);
  GBox = complex<double>();
  Vconst = 0.0;
  // Now fill in elements of GBox
  double vol = PtclRef->Lattice.Volume;
  double volInv = 1.0/vol;
  for (int iG=0; iG < Gvecs.size(); iG++)
  {
    TinyVector<int,OHMMS_DIM> gint = Gints[iG];
    PosType G = Gvecs[iG];
    double G2 = dot(G,G);
    TinyVector<int,OHMMS_DIM> index;
    for (int j=0; j<OHMMS_DIM; j++)
      index[j] = (gint[j] + SplineDim[j]) % SplineDim[j];
    if (!(index[0]==0 && index[1]==0 && index[2]==0))
    {
      GBox(index[0], index[1], index[2]) = vol *
                                           Rho_G[iG] * (4.0*M_PI*volInv/G2 - f_G[iG]);
      Vconst -= 0.5 * vol * vol * norm(Rho_G[iG])
                * (4.0*M_PI*volInv/G2 - f_G[iG]);
    }
  }
  // G=0 component calculated seperately
  GBox(0,0,0) = -vol * f_0 * Rho_G[0];
  Vconst += 0.5 * vol * vol * f_0 * norm(Rho_G[0]);
  app_log() << "  Constant potential = " << Vconst << endl;
  fftw_plan fft = fftw_plan_dft_3d
                  (SplineDim[0], SplineDim[1], SplineDim[2], (fftw_complex*)GBox.data(),
                   (fftw_complex*) rBox.data(), -1, FFTW_ESTIMATE);
  fftw_execute (fft);
  fftw_destroy_plan (fft);
  for (int i0=0; i0<SplineDim[0]; i0++)
    for (int i1=0; i1<SplineDim[1]; i1++)
      for (int i2=0; i2<SplineDim[2]; i2++)
        splineData(i0, i1, i2) = real(rBox(i0,i1,i2));
  BCtype_d bc0, bc1, bc2;
  Ugrid grid0, grid1, grid2;
  grid0.start=0.0;
  grid0.end=1.0;
  grid0.num = SplineDim[0];
  grid1.start=0.0;
  grid1.end=1.0;
  grid1.num = SplineDim[1];
  grid2.start=0.0;
  grid2.end=1.0;
  grid2.num = SplineDim[2];
  bc0.lCode = bc0.rCode = PERIODIC;
  bc1.lCode = bc1.rCode = PERIODIC;
  bc2.lCode = bc2.rCode = PERIODIC;
  VlongSpline = create_UBspline_3d_d (grid0, grid1, grid2, bc0, bc1, bc2,
                                      splineData.data());
//     grid0.num = PtclRef->Density_r.size(0);
//     grid1.num = PtclRef->Density_r.size(1);
//     grid2.num = PtclRef->Density_r.size(2);
//     DensitySpline = create_UBspline_3d_d (grid0, grid1, grid2, bc0, bc1, bc2,
// 					  PtclRef->Density_r.data());
}
Exemplo n.º 13
0
/*
 * Fourier transform of n 3D blocks, sampled at selected momenta.
 *
 * To use FFTW, the lattice needs to be redistributed: the n blocks are
 * distributed over as many processes as possible (the largest divisor of n
 * that does not exceed the number of processes), each owning process
 * gathers its blocks in full, runs a 3D FFTW transform on each of them and
 * picks out the requested momenta.  Finally process 0 collects the results
 * from the other owning processes.
 *
 *   out    per-block output arrays of nmom values
 *   in     per-block local input arrays (lx*ly*lz values on every process)
 *   n      number of blocks
 *   mom    momenta; components [1], [2], [3] are used as kz, ky, kx
 *   nmom   number of momenta
 *
 * NOTE(review): the gather uses MPI_COMM_WORLD ranks while the block layout
 * is taken from the cartesian communicator -- this assumes both number the
 * processes identically; confirm.
 */
void
qpb_ft(qpb_complex **out, qpb_complex **in, int n, int mom[][4], int nmom)
{
  MPI_Comm comm_cart = problem_params.mpi_comm_cart;
  int rank = problem_params.proc_id;
  int nprocs = problem_params.nprocs;

  /* global and local spatial extents */
  int Lz = problem_params.g_dim[1];
  int Ly = problem_params.g_dim[2];
  int Lx = problem_params.g_dim[3];
  int lz = problem_params.l_dim[1];
  int ly = problem_params.l_dim[2];
  int lx = problem_params.l_dim[3];
  int vol3d = Lx*Ly*Lz;
  int lvol3d = lx*ly*lz;

  /* largest divisor of n that is <= nprocs: number of owning processes */
  int nprocs_n = 0;
  for(int d=1; d<=nprocs; d++)
    {
      if((n % d) == 0)
        nprocs_n = d;
    }

  int n_loc = n / nprocs_n;         /* blocks per owning process */
  int is_owner = (rank < nprocs_n);

  fftw_complex *corr[n_loc];
  qpb_complex *swap = NULL;
  if(is_owner)
    {
      for(int b=0; b<n_loc; b++)
        corr[b] = fftw_malloc(sizeof(fftw_complex)*vol3d);
      swap = qpb_alloc(sizeof(qpb_complex)*vol3d);
    }

  /* Gather block (b + owner*n_loc) onto its owner and unpack it into
     lexicographic (x fastest) order. */
  for(int b=0; b<n_loc; b++)
    {
      for(int owner=0; owner<nprocs_n; owner++)
        {
          MPI_Gather(in[b+owner*n_loc], lvol3d*sizeof(qpb_complex), MPI_BYTE,
                     swap, lvol3d*sizeof(qpb_complex), MPI_BYTE, owner, MPI_COMM_WORLD);
          if(rank == owner)
            {
              for(int src=0; src<nprocs; src++)
                {
                  int coords[ND-1];
                  qpb_complex *ptr = swap + src*lvol3d;
                  MPI_Cart_coords(comm_cart, src, ND-1, coords);
                  int z0 = coords[0]*lz;
                  int y0 = coords[1]*ly;
                  int x0 = coords[2]*lx;
                  for(int z=z0; z<lz+z0; z++)
                    for(int y=y0; y<ly+y0; y++)
                      for(int x=x0; x<lx+x0; x++)
                        {
                          int site = x + y*Lx + z*Lx*Ly;
                          corr[b][site][0] = ptr->re;
                          corr[b][site][1] = ptr->im;
                          ptr++;
                        }
                }
            }
          MPI_Barrier(MPI_COMM_WORLD);
        }
    }

  if(is_owner)
    {
      /* in-place forward transform of every owned block */
      for(int b=0; b<n_loc; b++)
        {
          fftw_plan plan = fftw_plan_dft_3d(Lz, Ly, Lx, corr[b], corr[b],
                                            FFTW_FORWARD, FFTW_ESTIMATE);
          fftw_execute(plan);
          fftw_destroy_plan(plan);
        }

      /* pick the requested momenta, folded into [0, L) per direction */
      for(int b=0; b<n_loc; b++)
        for(int p=0; p<nmom; p++)
          {
            int kx = (Lx + mom[p][3]) % Lx;
            int ky = (Ly + mom[p][2]) % Ly;
            int kz = (Lz + mom[p][1]) % Lz;
            int site = kx + ky*Lx + kz*Lx*Ly;
            out[b][p] = (qpb_complex){corr[b][site][0],
                                      corr[b][site][1]};
          }
    }

  /* process 0 collects the results of the other owning processes */
  for(int p=1; p<nprocs_n; p++)
    {
      for(int b=0; b<n_loc; b++)
        {
          if(rank == 0)
            MPI_Recv(out[p*n_loc+b], nmom*sizeof(qpb_complex), MPI_BYTE, p, p, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
          if(rank == p)
            MPI_Send(out[b], nmom*sizeof(qpb_complex), MPI_BYTE, 0, p, MPI_COMM_WORLD);
        }
    }

  if(is_owner)
    {
      for(int b=0; b<n_loc; b++)
        fftw_free(corr[b]);
      free(swap);
    }
}
bool c_FourierTransfrom::ifftw_complex_3d(const Mat_<Vec6d> &_input,
                                         Mat_<Vec6d> &_output)
{
    size_t height = _input.rows;
    size_t width = _input.cols;
    size_t n_channels = _input.channels() / 2;
    size_t n_pixels = height * width;
    size_t n_data = n_pixels * n_channels;

    fftw_complex *in, *out;
    fftw_plan p;

    in = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * n_data);
    out = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * n_data);

    p = fftw_plan_dft_3d(height, width, n_channels, in, out, FFTW_BACKWARD,
                         FFTW_ESTIMATE);

    /*!< prepare the data */
    for (size_t i_row = 0; i_row < height; ++i_row)
    {
        const Vec3d *p = _input.ptr<Vec3d>(i_row);
        for (size_t i_col = 0; i_col < width; ++i_col)
        {
            size_t index = i_row * width + i_col;
            for (size_t k = 0; k < n_channels; ++k)
            {
                in[n_pixels * k + index][0] = p[i_col][k];
                in[n_pixels * k + index][1] = p[i_col][k + n_channels];
            }
#if 0
            in[index][0] = p[i_col][4];
            in[index][1] = p[i_col][5];
            in[n_pixels + index][0] = p[i_col][2];
            in[n_pixels + index][1] = p[i_col][3];
            in[n_pixels * 2 + index][0] = p[i_col][0];
            in[n_pixels * 2 + index][1] = p[i_col][1];
#endif
        }
    }

    fftw_execute(p);

    /*!< write back data */
    _output = Mat_<Vec6d>::zeros(_input.size());
    for (size_t i_row = 0; i_row < height; ++i_row)
    {
        Vec6d *p = _output.ptr<Vec6d>(i_row);
        for (size_t i_col = 0; i_col < width; ++i_col)
        {
            size_t index = i_row * width + i_col;
            for (size_t k = 0; k < n_channels; ++k)
            {
                p[i_col][k] = out[n_pixels * k + index][0];
                p[i_col][k + n_channels] = out[n_pixels * k + index][1];
            }
#if 0
            p[i_col][0] = out[n_pixels * 2 + index][0];
            p[i_col][1] = out[n_pixels + index][0];
            p[i_col][2] = out[index][0];
            p[i_col][3] = out[n_pixels * 2 + index][1];
            p[i_col][4] = out[n_pixels + index][1];
            p[i_col][5] = out[index][1];
#endif
        }
    }

    _output /= n_data;

    fftw_destroy_plan(p);
    fftw_free(in);
    fftw_free(out);

    return true;
}
void EinsplineSetBuilder::ReadBands_ESHDF(int spin, EinsplineSetExtended<double>* orbitalSet)
{
  update_token(__FILE__,__LINE__,"ReadBands_ESHDF:double");
  ReportEngine PRE("EinsplineSetBuilder","ReadBands_ESHDF(EinsplineSetExtended<double>*");
  vector<AtomicOrbital<double> > realOrbs(AtomicOrbitals.size());
  for (int iat=0; iat<realOrbs.size(); iat++)
  {
    AtomicOrbital<complex<double> > &corb (AtomicOrbitals[iat]);
    realOrbs[iat].set_pos  (corb.Pos);
    realOrbs[iat].set_lmax (corb.lMax);
    realOrbs[iat].set_cutoff (corb.CutoffRadius);
    realOrbs[iat].set_spline (corb.SplineRadius, corb.SplinePoints);
    realOrbs[iat].set_polynomial (corb.PolyRadius, corb.PolyOrder);
    realOrbs[iat].Lattice = corb.Lattice;
  }
  bool root = myComm->rank()==0;
  // bcast other stuff
  myComm->bcast (NumDistinctOrbitals);
  myComm->bcast (NumValenceOrbs);
  myComm->bcast (NumCoreOrbs);
  int N = NumDistinctOrbitals;
  orbitalSet->kPoints.resize(N);
  orbitalSet->MakeTwoCopies.resize(N);
  orbitalSet->StorageValueVector.resize(N);
  orbitalSet->BlendValueVector.resize(N);
  orbitalSet->StorageLaplVector.resize(N);
  orbitalSet->BlendLaplVector.resize(N);
  orbitalSet->StorageGradVector.resize(N);
  orbitalSet->BlendGradVector.resize(N);
  orbitalSet->StorageHessVector.resize(N);
  orbitalSet->StorageGradHessVector.resize(N);
  orbitalSet->phase.resize(N);
  orbitalSet->eikr.resize(N);
  orbitalSet->NumValenceOrbs = NumValenceOrbs;
  orbitalSet->NumCoreOrbs    = NumCoreOrbs;
  orbitalSet->FirstOrderSplines.resize(IonPos.size());
  // Read in k-points
  int numOrbs = orbitalSet->getOrbitalSetSize();
  int num = 0;
  vector<BandInfo>& SortBands(*FullBands[spin]);
  if (root)
  {
    for (int iorb=0; iorb<N; iorb++)
    {
      int ti = SortBands[iorb].TwistIndex;
      PosType twist  = TwistAngles[ti];
      orbitalSet->kPoints[iorb] = orbitalSet->PrimLattice.k_cart(twist);
      orbitalSet->MakeTwoCopies[iorb] =
        (num < (numOrbs-1)) && SortBands[iorb].MakeTwoCopies;
      num += orbitalSet->MakeTwoCopies[iorb] ? 2 : 1;
    }
    PosType twist0 = TwistAngles[SortBands[0].TwistIndex];
    for (int i=0; i<OHMMS_DIM; i++)
      if (std::fabs(std::fabs(twist0[i]) - 0.5) < 1.0e-8)
        orbitalSet->HalfG[i] = 1;
      else
        orbitalSet->HalfG[i] = 0;
    EinsplineSetBuilder::RotateBands_ESHDF(spin, orbitalSet);
  }
  myComm->bcast(orbitalSet->kPoints);
  myComm->bcast(orbitalSet->MakeTwoCopies);
  myComm->bcast(orbitalSet->HalfG);
  // First, check to see if we have already read this in
  H5OrbSet set(H5FileName, spin, N);
  bool havePsir=!ReadGvectors_ESHDF();
  app_log() << "MeshSize = (" << MeshSize[0] << ", "
            << MeshSize[1] << ", " << MeshSize[2] << ")\n";
  //int nx, ny, nz, bi, ti;
  int nx, ny, nz;
  nx=MeshSize[0];
  ny=MeshSize[1];
  nz=MeshSize[2];
  Ugrid x_grid, y_grid, z_grid;
  BCtype_d xBC, yBC, zBC;
  if (orbitalSet->HalfG[0])
  {
    xBC.lCode = ANTIPERIODIC;
    xBC.rCode = ANTIPERIODIC;
  }
  else
  {
    xBC.lCode = PERIODIC;
    xBC.rCode = PERIODIC;
  }
  if (orbitalSet->HalfG[1])
  {
    yBC.lCode = ANTIPERIODIC;
    yBC.rCode = ANTIPERIODIC;
  }
  else
  {
    yBC.lCode = PERIODIC;
    yBC.rCode = PERIODIC;
  }
  if (orbitalSet->HalfG[2])
  {
    zBC.lCode = ANTIPERIODIC;
    zBC.rCode = ANTIPERIODIC;
  }
  else
  {
    zBC.lCode = PERIODIC;
    zBC.rCode = PERIODIC;
  }
  x_grid.start = 0.0;
  x_grid.end = 1.0;
  x_grid.num = nx;
  y_grid.start = 0.0;
  y_grid.end = 1.0;
  y_grid.num = ny;
  z_grid.start = 0.0;
  z_grid.end = 1.0;
  z_grid.num = nz;
  // Create the multiUBspline object
  orbitalSet->MultiSpline =
    create_multi_UBspline_3d_d (x_grid, y_grid, z_grid, xBC, yBC, zBC, NumValenceOrbs);
  if (HaveOrbDerivs)
  {
    orbitalSet->FirstOrderSplines.resize(IonPos.size());
    for (int ion=0; ion<IonPos.size(); ion++)
      for (int dir=0; dir<OHMMS_DIM; dir++)
        orbitalSet->FirstOrderSplines[ion][dir] =
          create_multi_UBspline_3d_d (x_grid, y_grid, z_grid, xBC, yBC, zBC, NumValenceOrbs);
  }
  //////////////////////////////////////
  // Create the MuffinTin APW splines //
  //////////////////////////////////////
  orbitalSet->MuffinTins.resize(NumMuffinTins);
  for (int tin=0; tin<NumMuffinTins; tin++)
  {
    orbitalSet->MuffinTins[tin].Atom = tin;
    orbitalSet->MuffinTins[tin].set_center (MT_centers[tin]);
    orbitalSet->MuffinTins[tin].set_lattice(Lattice);
    orbitalSet->MuffinTins[tin].init_APW
    (MT_APW_rgrids[tin], MT_APW_lmax[tin],
     NumValenceOrbs);
  }
  for (int iat=0; iat<realOrbs.size(); iat++)
  {
    realOrbs[iat].set_num_bands(NumValenceOrbs);
    realOrbs[iat].allocate();
  }
  int isComplex;
  if (root)
  {
    HDFAttribIO<int> h_isComplex(isComplex);
    h_isComplex.read(H5FileID, "/electrons/psi_r_is_complex");
  }
  myComm->bcast(isComplex);
  bool isCore = bcastSortBands(spin,N,root);
  if(isCore)
  {
    APP_ABORT("Core states not supported by ES-HDF yet.");
  }
  //this is common
  Array<double,3> splineData(nx,ny,nz);
  if(havePsir)
  {
    if(isComplex)
    {
      app_log() << "   Reading complex psi_r and convert to real" << endl;
      Array<complex<double>,3> rawData;
      for(int iorb=0,ival=0; iorb<N; ++iorb, ++ival)
      {
        int ti=SortBands[iorb].TwistIndex;
        if(root)
        {
          ostringstream path;
          path << "/electrons/kpoint_" << SortBands[iorb].TwistIndex
               << "/spin_" << spin << "/state_" << SortBands[iorb].BandIndex << "/psi_r";
          HDFAttribIO<Array<complex<double>,3> >  h_splineData(rawData);
          h_splineData.read(H5FileID, path.str().c_str());
        }
        myComm->bcast(rawData);
        //multiply twist factor and project on the real
        fix_phase_c2r(rawData,splineData,TwistAngles[ti]);
        set_multi_UBspline_3d_d (orbitalSet->MultiSpline, ival, splineData.data());
      }
    }
    else
    {
      app_log() << "   Reading real psi_r" << endl;
      for(int iorb=0,ival=0; iorb<N; ++iorb, ++ival)
      {
        if(root)
        {
          ostringstream path;
          path << "/electrons/kpoint_" << SortBands[iorb].TwistIndex
               << "/spin_" << spin << "/state_" << SortBands[iorb].BandIndex << "/psi_r";
          HDFAttribIO<Array<double,3> >  h_splineData(splineData);
          h_splineData.read(H5FileID, path.str().c_str());
        }
        myComm->bcast(splineData);
        set_multi_UBspline_3d_d (orbitalSet->MultiSpline, ival, splineData.data());
      }
    }
  }
  else
  {
    Array<ComplexType,3> FFTbox;
    FFTbox.resize(MeshSize[0], MeshSize[1], MeshSize[2]);
    fftw_plan FFTplan = fftw_plan_dft_3d
                        (MeshSize[0], MeshSize[1], MeshSize[2],
                         reinterpret_cast<fftw_complex*>(FFTbox.data()),
                         reinterpret_cast<fftw_complex*>(FFTbox.data()),
                         +1, FFTW_ESTIMATE);
    for(int iorb=0,ival=0; iorb<N; ++iorb, ++ival)
    {
      Vector<complex<double> > cG;
      int ncg=0;
      int ti=SortBands[iorb].TwistIndex;
      if(root)
      {
        ostringstream path;
        path << "/electrons/kpoint_" << SortBands[iorb].TwistIndex
             << "/spin_" << spin << "/state_" << SortBands[iorb].BandIndex << "/psi_g";
        HDFAttribIO<Vector<complex<double> > >  h_cG(cG);
        h_cG.read (H5FileID, path.str().c_str());
        ncg=cG.size();
      }
      myComm->bcast(ncg);
      if(ncg != Gvecs[0].size())
      {
        APP_ABORT("Failed : ncg != Gvecs[0].size()");
      }
      if(!root)
        cG.resize(ncg);
      myComm->bcast(cG);
      unpack4fftw(cG,Gvecs[0],MeshSize,FFTbox);
      fftw_execute (FFTplan);
      fix_phase_rotate_c2r(FFTbox,splineData,TwistAngles[ti]);
      set_multi_UBspline_3d_d (orbitalSet->MultiSpline, ival, splineData.data());
    }
    fftw_destroy_plan(FFTplan);
  }
  for(int iorb=0,ival=0; iorb<N; ++iorb, ++ival)
  {
    // Read atomic orbital information
    for (int iat=0; iat<realOrbs.size(); iat++)
    {
      app_log() << "Reading orbital " << iat << " for band " << ival << endl;
      AtomicOrbital<double> &orb = realOrbs[iat];
      //AtomicOrbital<complex<double> > &orb = realOrbs[iat];
      Array<complex<double>,2> radial_spline(orb.SplinePoints,orb.Numlm),
            poly_coefs(orb.PolyOrder+1,orb.Numlm);
      int ti   = SortBands[iorb].TwistIndex;
      if (root)
      {
        int bi   = SortBands[iorb].BandIndex;
        ostringstream path;
        path << "/electrons/kpoint_" << ti << "/spin_" << spin << "/state_" << bi << "/";
        ostringstream spline_path, poly_path;
        spline_path << path.str() << "radial_spline_" << iat;
        poly_path   << path.str() << "poly_coefs_"    << iat;
        HDFAttribIO<Array<complex<double>,2> > h_radial_spline(radial_spline);
        HDFAttribIO<Array<complex<double>,2> > h_poly_coefs(poly_coefs);
        h_radial_spline.read(H5FileID, spline_path.str().c_str());
        h_poly_coefs.read   (H5FileID, poly_path.str().c_str());
      }
      myComm->bcast(radial_spline);
      myComm->bcast(poly_coefs);
      realOrbs[iat].set_band (ival, radial_spline, poly_coefs, TwistAngles[ti]);
    }
  }
  for(int iorb=0,ival=0; iorb<N; ++iorb, ++ival)
  {
    // Now read muffin tin data
    for (int tin=0; tin<NumMuffinTins; tin++)
    {
      // app_log() << "Reading data for muffin tin " << tin << endl;
      PosType twist, k;
      int lmax = MT_APW_lmax[tin];
      int numYlm = (lmax+1)*(lmax+1);
      Array<complex<double>,2>
      u_lm_r(numYlm, MT_APW_num_radial_points[tin]);
      Array<complex<double>,1> du_lm_dr (numYlm);
      int ti   = SortBands[iorb].TwistIndex;
      if (root)
      {
        int bi   = SortBands[iorb].BandIndex;
        twist = TwistAngles[ti];
        k = orbitalSet->PrimLattice.k_cart(twist);
        string uName  = MuffinTinPath (ti, bi,tin) + "u_lm_r";
        string duName = MuffinTinPath (ti, bi,tin) + "du_lm_dr";
        HDFAttribIO<Array<complex<double>,2> > h_u_lm_r(u_lm_r);
        HDFAttribIO<Array<complex<double>,1> > h_du_lm_dr(du_lm_dr);
        h_u_lm_r.read(H5FileID, uName.c_str());
        h_du_lm_dr.read(H5FileID, duName.c_str());
      }
      myComm->bcast(u_lm_r);
      myComm->bcast(du_lm_dr);
      myComm->bcast(k);
      double Z = (double)IonTypes(tin);
      OrbitalSet->MuffinTins[tin].set_APW (ival, k, u_lm_r, du_lm_dr, Z);
    }
  }
  //FIX HaveOrbDerivs after debugging
//	// Now read orbital derivatives if we have them
//	if (HaveOrbDerivs) {
//	  for (int ion=0; ion<IonPos.size(); ion++)
//	    for (int dim=0; dim<OHMMS_DIM; dim++) {
//	      if (root) {
//		int ti   = SortBands[iorb].TwistIndex;
//		int bi   = SortBands[iorb].BandIndex;
//
//		app_log() << "Reading orbital derivative for ion " << ion
//			  << " dim " << dim << " spin " << spin << " band "
//			  << bi << " kpoint " << ti << endl;
//		ostringstream path;
//		path << "/electrons/kpoint_" << ti << "/spin_" << spin << "/state_" << bi << "/"
//		     << "dpsi_" << ion << "_" << dim << "_r";
//		string psirName = path.str();
//		if (isComplex) {
//		  HDFAttribIO<Array<complex<double>,3> > h_rawData(rawData);
//		  h_rawData.read(H5FileID, psirName.c_str());
//		  if ((rawData.size(0) != nx) ||
//		      (rawData.size(1) != ny) ||
//		      (rawData.size(2) != nz)) {
//		    fprintf (stderr, "Error in EinsplineSetBuilder::ReadBands.\n");
//		    fprintf (stderr, "Extended orbitals should all have the same dimensions\n");
//		    abort();
//		  }
//#pragma omp parallel for
//		  for (int ix=0; ix<nx; ix++) {
//		  PosType ru;
//		    ru[0] = (RealType)ix / (RealType)nx;
//		    for (int iy=0; iy<ny; iy++) {
//		      ru[1] = (RealType)iy / (RealType)ny;
//		      for (int iz=0; iz<nz; iz++) {
//			ru[2] = (RealType)iz / (RealType)nz;
//			double phi = -2.0*M_PI*dot (ru, TwistAngles[ti]);
//			double s, c;
//			sincos(phi, &s, &c);
//			complex<double> phase(c,s);
//			complex<double> z = phase*rawData(ix,iy,iz);
//			splineData(ix,iy,iz) = z.real();
//		      }
//		    }
//		  }
//		}
//		else {
//		  HDFAttribIO<Array<double,3> >  h_splineData(splineData);
//		  h_splineData.read(H5FileID, psirName.c_str());
//		  if ((splineData.size(0) != nx) ||
//		      (splineData.size(1) != ny) ||
//		      (splineData.size(2) != nz)) {
//		    fprintf (stderr, "Error in EinsplineSetBuilder::ReadBands.\n");
//		    fprintf (stderr, "Extended orbitals should all have the same dimensions\n");
//		    abort();
//		  }
//		}
//	      }
//	      myComm->bcast(splineData);
//	      set_multi_UBspline_3d_d
//		(orbitalSet->FirstOrderSplines[ion][dim], ival, splineData.data());
//	    }
//	}
//
//
//
  orbitalSet->AtomicOrbitals = realOrbs;
  for (int i=0; i<orbitalSet->AtomicOrbitals.size(); i++)
    orbitalSet->AtomicOrbitals[i].registerTimers();
  //ExtendedMap_d[set] = orbitalSet->MultiSpline;
}
/**
 * Load the orbitals of one spin channel from the ES-HDF file into a
 * complex-valued extended Einspline orbital set.
 *
 * Rank 0 ("root") performs every HDF5 read and the data is then broadcast
 * through myComm, so all ranks of myComm must call this function together
 * and execute the broadcasts in the same order.
 *
 * Steps, in order:
 *  - broadcast the orbital counts and size the per-orbital storage;
 *  - on root, compute k-points and MakeTwoCopies flags, then broadcast;
 *  - create the periodic 3-D complex multi-B-spline and the muffin-tin APW
 *    splines;
 *  - fill the B-spline either from "psi_g" plane-wave coefficients via an
 *    in-place 3-D FFT (when ReadGvectors_ESHDF() returns true) or directly
 *    from real-space "psi_r" data;
 *  - read atomic-orbital radial splines / polynomial coefficients and
 *    muffin-tin data for every band.
 *
 * Aborts (APP_ABORT) when the file holds real orbitals, when core states
 * are present, or when the number of plane-wave coefficients does not match
 * Gvecs[0].  Accumulated timings of the individual phases are written to
 * app_log().
 *
 * @param spin        spin channel; selects FullBands[spin] and the
 *                    "spin_<spin>" HDF5 groups
 * @param orbitalSet  orbital set that receives k-points, splines, muffin
 *                    tins and atomic orbitals
 */
void EinsplineSetBuilder::ReadBands_ESHDF(int spin, EinsplineSetExtended<complex<double > >* orbitalSet)
{
  update_token(__FILE__,__LINE__,"ReadBands_ESHDF:complex");
  ReportEngine PRE("EinsplineSetBuilder","ReadBands_ESHDF(EinsplineSetExtended<complex<double > >*");
  Timer c_prep, c_unpack,c_fft, c_phase, c_spline, c_newphase, c_h5, c_init;
  double t_prep=0.0, t_unpack=0.0, t_fft=0.0, t_phase=0.0, t_spline=0.0, t_newphase=0.0, t_h5=0.0, t_init=0.0;
  c_prep.restart();
  bool root = myComm->rank()==0;
  vector<BandInfo>& SortBands(*FullBands[spin]);
  // bcast other stuff
  myComm->bcast (NumDistinctOrbitals);
  myComm->bcast (NumValenceOrbs);
  myComm->bcast (NumCoreOrbs);
  int N = NumDistinctOrbitals;
  orbitalSet->kPoints.resize(N);
  orbitalSet->MakeTwoCopies.resize(N);
  orbitalSet->StorageValueVector.resize(N);
  orbitalSet->BlendValueVector.resize(N);
  orbitalSet->StorageLaplVector.resize(N);
  orbitalSet->BlendLaplVector.resize(N);
  orbitalSet->StorageGradVector.resize(N);
  orbitalSet->BlendGradVector.resize(N);
  orbitalSet->StorageHessVector.resize(N);
  orbitalSet->StorageGradHessVector.resize(N);
  orbitalSet->phase.resize(N);
  orbitalSet->eikr.resize(N);
  orbitalSet->NumValenceOrbs = NumValenceOrbs;
  orbitalSet->NumCoreOrbs    = NumCoreOrbs;
  // Read in k-points
  int numOrbs = orbitalSet->getOrbitalSetSize();
  int num = 0;
  if (root)
  {
    for (int iorb=0; iorb<N; iorb++)
    {
      int ti = SortBands[iorb].TwistIndex;
      PosType twist  = TwistAngles[ti];
      orbitalSet->kPoints[iorb] = orbitalSet->PrimLattice.k_cart(twist);
      // A band is duplicated only while at least two slots remain in the set
      orbitalSet->MakeTwoCopies[iorb] =
        (num < (numOrbs-1)) && SortBands[iorb].MakeTwoCopies;
      num += orbitalSet->MakeTwoCopies[iorb] ? 2 : 1;
    }
  }
  myComm->bcast(orbitalSet->kPoints);
  myComm->bcast(orbitalSet->MakeTwoCopies);
  // First, check to see if we have already read this in
  H5OrbSet set(H5FileName, spin, N);
  ///check mesh or ready for FFT grid
  bool havePsig=ReadGvectors_ESHDF();
  app_log() << "MeshSize = (" << MeshSize[0] << ", " << MeshSize[1] << ", " << MeshSize[2] << ")\n";
  int nx, ny, nz;
  nx=MeshSize[0];
  ny=MeshSize[1];
  nz=MeshSize[2];
  Ugrid x_grid, y_grid, z_grid;
  BCtype_z xBC, yBC, zBC;
  // Complex orbitals always use periodic boundaries
  xBC.lCode = PERIODIC;
  xBC.rCode = PERIODIC;
  yBC.lCode = PERIODIC;
  yBC.rCode = PERIODIC;
  zBC.lCode = PERIODIC;
  zBC.rCode = PERIODIC;
  // Unit-cell grid in reduced coordinates
  x_grid.start = 0.0;
  x_grid.end = 1.0;
  x_grid.num = nx;
  y_grid.start = 0.0;
  y_grid.end = 1.0;
  y_grid.num = ny;
  z_grid.start = 0.0;
  z_grid.end = 1.0;
  z_grid.num = nz;
  // Create the multiUBspline object
  orbitalSet->MultiSpline =
    create_multi_UBspline_3d_z (x_grid, y_grid, z_grid, xBC, yBC, zBC, NumValenceOrbs);
  //////////////////////////////////////
  // Create the MuffinTin APW splines //
  //////////////////////////////////////
  orbitalSet->MuffinTins.resize(NumMuffinTins);
  for (int tin=0; tin<NumMuffinTins; tin++)
  {
    orbitalSet->MuffinTins[tin].Atom = tin;
    orbitalSet->MuffinTins[tin].set_center (MT_centers[tin]);
    orbitalSet->MuffinTins[tin].set_lattice(Lattice);
    orbitalSet->MuffinTins[tin].init_APW
    (MT_APW_rgrids[tin], MT_APW_lmax[tin],
     NumValenceOrbs);
  }
  for (int iat=0; iat<AtomicOrbitals.size(); iat++)
  {
    AtomicOrbitals[iat].set_num_bands(NumValenceOrbs);
    AtomicOrbitals[iat].allocate();
  }
  int isComplex=1;
  if (root)
  {
    HDFAttribIO<int> h_isComplex(isComplex);
    h_isComplex.read(H5FileID, "/electrons/psi_r_is_complex");
  }
  myComm->bcast(isComplex);
  if (!isComplex)
  {
    APP_ABORT("Expected complex orbitals in ES-HDF file, but found real ones.");
  }
  // NOTE(review): the double overload calls RotateBands_ESHDF on root only,
  // this one on every rank -- confirm which is intended.
  EinsplineSetBuilder::RotateBands_ESHDF(spin, orbitalSet);
  bool isCore = bcastSortBands(spin,N,root);
  if(isCore)
  {
    APP_ABORT("Core states not supported by ES-HDF yet.");
  }
  t_prep += c_prep.elapsed();
  /** For valence orbitals,
   * - extended orbitals either in G or in R
   * - localized orbitals
   */
  //this can potentially break
  Array<ComplexType,3> splineData(nx,ny,nz);
  if(havePsig)//perform FFT using FFTW
  {
    c_init.restart();
    // One in-place plan (sign +1) is reused for every band
    Array<ComplexType,3> FFTbox;
    FFTbox.resize(MeshSize[0], MeshSize[1], MeshSize[2]);
    fftw_plan FFTplan = fftw_plan_dft_3d
                        (MeshSize[0], MeshSize[1], MeshSize[2],
                         reinterpret_cast<fftw_complex*>(FFTbox.data()),
                         reinterpret_cast<fftw_complex*>(FFTbox.data()),
                         +1, FFTW_ESTIMATE);
    Vector<complex<double> > cG(MaxNumGvecs);
    //this will be parallelized with OpenMP
    for(int iorb=0,ival=0; iorb<N; ++iorb, ++ival)
    {
      int ncg=0;
      int ti=SortBands[iorb].TwistIndex;
      c_h5.restart();
      if(root)
      {
        ostringstream path;
        path << "/electrons/kpoint_" << SortBands[iorb].TwistIndex
             << "/spin_" << spin << "/state_" << SortBands[iorb].BandIndex << "/psi_g";
        HDFAttribIO<Vector<complex<double> > >  h_cG(cG);
        h_cG.read (H5FileID, path.str().c_str());
        ncg=cG.size();
      }
      myComm->bcast(ncg);
      if(ncg != Gvecs[0].size())
      {
        APP_ABORT("Failed : ncg != Gvecs[0].size()");
      }
      // Non-root ranks size their receive buffer before the broadcast
      if(!root)
        cG.resize(ncg);
      myComm->bcast(cG);
      t_h5 += c_h5.elapsed();
      c_unpack.restart();
      unpack4fftw(cG,Gvecs[0],MeshSize,FFTbox);
      t_unpack+= c_unpack.elapsed();
      c_fft.restart();
      fftw_execute (FFTplan);
      t_fft+= c_fft.elapsed();
      c_phase.restart();
      fix_phase_rotate_c2c(FFTbox,splineData,TwistAngles[ti]);
      t_phase+= c_phase.elapsed();
      c_spline.restart();
      set_multi_UBspline_3d_z(orbitalSet->MultiSpline, ival, splineData.data());
      t_spline+= c_spline.elapsed();
    }
    fftw_destroy_plan(FFTplan);
    t_init+=c_init.elapsed();
  }
  else
  {
    //this will be parallelized with OpenMP
    for(int iorb=0,ival=0; iorb<N; ++iorb, ++ival)
    {
      //check dimension
      if(root)
      {
        ostringstream path;
        path << "/electrons/kpoint_" << SortBands[iorb].TwistIndex
             << "/spin_" << spin << "/state_" << SortBands[iorb].BandIndex << "/psi_r";
        HDFAttribIO<Array<complex<double>,3> >  h_splineData(splineData);
        h_splineData.read(H5FileID, path.str().c_str());
      }
      myComm->bcast(splineData);
      set_multi_UBspline_3d_z(orbitalSet->MultiSpline, ival, splineData.data());
    }
    //return true;
  }
  app_log() << "    READBANDS::PREP   = " << t_prep << endl;
  app_log() << "    READBANDS::H5     = " << t_h5 << endl;
  app_log() << "    READBANDS::UNPACK = " << t_unpack << endl;
  app_log() << "    READBANDS::FFT    = " << t_fft << endl;
  app_log() << "    READBANDS::PHASE  = " << t_phase << endl;
  app_log() << "    READBANDS::SPLINE = " << t_spline << endl;
  app_log() << "    READBANDS::SUM    = " << t_init << endl;
  //now localized orbitals
  for(int iorb=0,ival=0; iorb<N; ++iorb, ++ival)
  {
    PosType twist=TwistAngles[SortBands[iorb].TwistIndex];
    // Read atomic orbital information
    for (int iat=0; iat<AtomicOrbitals.size(); iat++)
    {
      app_log() << "Reading orbital " << iat << " for band " << ival << endl;
      AtomicOrbital<complex<double> > &orb = AtomicOrbitals[iat];
      Array<complex<double>,2> radial_spline(orb.SplinePoints,orb.Numlm),
            poly_coefs(orb.PolyOrder+1,orb.Numlm);
      if (root)
      {
        int ti   = SortBands[iorb].TwistIndex;
        int bi   = SortBands[iorb].BandIndex;
        ostringstream path;
        path << "/electrons/kpoint_" << ti << "/spin_" << spin << "/state_" << bi << "/";
        ostringstream spline_path, poly_path;
        spline_path << path.str() << "radial_spline_" << iat;
        poly_path   << path.str() << "poly_coefs_"    << iat;
        HDFAttribIO<Array<complex<double>,2> > h_radial_spline(radial_spline);
        HDFAttribIO<Array<complex<double>,2> > h_poly_coefs(poly_coefs);
        h_radial_spline.read(H5FileID, spline_path.str().c_str());
        h_poly_coefs.read   (H5FileID, poly_path.str().c_str());
      }
      myComm->bcast(radial_spline);
      myComm->bcast(poly_coefs);
      AtomicOrbitals[iat].set_band (ival, radial_spline, poly_coefs, twist);
    }
    // Now read muffin tin data
    for (int tin=0; tin<NumMuffinTins; tin++)
    {
      // app_log() << "Reading data for muffin tin " << tin << endl;
      PosType k;
      int lmax = MT_APW_lmax[tin];
      int numYlm = (lmax+1)*(lmax+1);
      Array<complex<double>,2>
      u_lm_r(numYlm, MT_APW_num_radial_points[tin]);
      Array<complex<double>,1> du_lm_dr (numYlm);
      if (root)
      {
        int ti   = SortBands[iorb].TwistIndex;
        int bi   = SortBands[iorb].BandIndex;
        // twist is this band's twist angle, TwistAngles[ti], set at the top
        // of the iorb loop
        k = orbitalSet->PrimLattice.k_cart(twist);
        string uName  = MuffinTinPath (ti, bi,tin) + "u_lm_r";
        string duName = MuffinTinPath (ti, bi,tin) + "du_lm_dr";
        HDFAttribIO<Array<complex<double>,2> > h_u_lm_r(u_lm_r);
        HDFAttribIO<Array<complex<double>,1> > h_du_lm_dr(du_lm_dr);
        h_u_lm_r.read(H5FileID, uName.c_str());
        h_du_lm_dr.read(H5FileID, duName.c_str());
      }
      myComm->bcast(u_lm_r);
      myComm->bcast(du_lm_dr);
      // k is computed on root only; the other ranks receive it here
      myComm->bcast(k);
      double Z = (double)IonTypes(tin);
      // Store into the orbitalSet argument, whose muffin tins were
      // initialised with init_APW above.
      orbitalSet->MuffinTins[tin].set_APW (ival, k, u_lm_r, du_lm_dr, Z);
    }
  }
  orbitalSet->AtomicOrbitals = AtomicOrbitals;
  for (int i=0; i<orbitalSet->AtomicOrbitals.size(); i++)
    orbitalSet->AtomicOrbitals[i].registerTimers();
  //ExtendedMap_z[set] = orbitalSet->MultiSpline;
}