Пример #1
0
/* svd -- runs bifactor() on a copy of A, pulls the diagonal and the
	adjacent off-diagonal out of the result and hands them to bisvd();
	the vector d filled this way is returned
	-- U and V may each be MNULL; when given they must be square with
	orders A->m and A->n respectively, and are reset with m_ident()
	before being passed on to bifactor() and bisvd()
	-- presumably d ends up holding the singular values of A; the
	numerical work happens in bifactor()/bisvd(), which are not
	visible here -- TODO confirm against their documentation */
VEC	*svd(MAT *A, MAT *U, MAT *V, VEC *d)
#endif
{
	STATIC VEC	*f=VNULL;
	MAT	*work;
	int	k, limit, tall;

	if ( A == MNULL )
		error(E_NULL,"svd");
	if ( ( U != MNULL && U->m != U->n ) || ( V != MNULL && V->m != V->n ) )
		error(E_SQUARE,"svd");
	if ( ( U != MNULL && U->m != A->m ) || ( V != MNULL && V->m != A->n ) )
		error(E_SIZES,"svd");

	/* bifactor() is applied to this copy, not to A */
	work = m_copy(A,MNULL);
	if ( U )
	    m_ident(U);
	if ( V )
	    m_ident(V);

	limit = min(work->m,work->n);
	d = v_resize(d,limit);
	f = v_resize(f,limit-1);
	MEM_STAT_REG(f,TYPE_VEC);

	bifactor(work,U,V);

	/* the off-diagonal is read from above the diagonal when
	   rows >= columns, and from below it otherwise */
	tall = ( work->m >= work->n );
	for ( k = 0; k < limit; k++ )
	{
	    d->ve[k] = work->me[k][k];
	    if ( k+1 >= limit )
		continue;	/* last diagonal entry has no neighbour */
	    f->ve[k] = tall ? work->me[k][k+1] : work->me[k+1][k];
	}

	/* U and V swap roles in the rows < columns case */
	if ( tall )
	    bisvd(d,f,U,V);
	else
	    bisvd(d,f,V,U);

	M_FREE(work);
#ifdef	THREADSAFE
	V_FREE(f);
#endif

	return d;
}
Пример #2
0
/* Create a plan for a distributed 3d FFT carried out as three sets of 1d FFTs
   (fast, mid, slow axis) separated by data remaps.

   comm                 MPI communicator of the participating processes
   nfast,nmid,nslow     global grid size along the fast, mid and slow axes
   in_ilo .. in_khi     inclusive index bounds of this process's input brick
   out_ilo .. out_khi   inclusive index bounds of this process's output brick
   scaled               0 = plan is marked unscaled; otherwise the plan stores
                        the factor 1/(nfast*nmid*nslow) and the output count
   permute              forwarded as (permute+1)%3 to the final remap; with
                        permute == 2 the 2nd->3rd remap may already produce
                        the final layout, making the final remap unnecessary
   nbuf                 out: copy_size + scratch_size, counted in FFT_DATA

   Returns the new plan, or NULL if an allocation or a remap-plan creation
   fails.  On failure the plan struct and any copy buffer are freed; remap
   plans that were already created are NOT released here, because no destroy
   routine for them is visible from this function. */
struct fft_plan_3d *fft_3d_create_plan
(MPI_Comm comm, int nfast, int nmid, int nslow,
 int in_ilo, int in_ihi, int in_jlo, int in_jhi,
 int in_klo, int in_khi,
 int out_ilo, int out_ihi, int out_jlo, int out_jhi,
 int out_klo, int out_khi,
 int scaled, int permute, int *nbuf)
{
  struct fft_plan_3d *plan;
  int me,nprocs;
  int flag,remapflag;
  int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi;
  int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi;
  int third_ilo,third_ihi,third_jlo,third_jhi,third_klo,third_khi;
  int out_size,first_size,second_size,third_size,copy_size,scratch_size;
  int np1=0,np2=0,ip1,ip2;

  /* split the processes into an np1 x np2 grid; (ip1,ip2) = my coordinates */

  MPI_Comm_rank(comm, &me);
  MPI_Comm_size(comm, &nprocs);
  bifactor(nprocs,&np1,&np2);
  ip1 = me % np1;
  ip2 = me / np1;

  /* allocate memory for plan data struct */
  plan = (struct fft_plan_3d *) malloc(sizeof(struct fft_plan_3d));
  if (plan == NULL) return NULL;

  /* remap from initial distribution to layout needed for 1st set of 1d FFTs
     not needed if all procs own entire fast axis initially
     first indices = distribution after 1st set of FFTs */

  if (in_ilo == 0 && in_ihi == nfast-1)
    flag = 0;
  else
    flag = 1;

  /* MPI_MAX: a single process needing the remap forces it on everyone */
  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

  if (remapflag == 0) {
    first_ilo = in_ilo;
    first_ihi = in_ihi;
    first_jlo = in_jlo;
    first_jhi = in_jhi;
    first_klo = in_klo;
    first_khi = in_khi;
    plan->pre_plan = NULL;
  }
  else {
    first_ilo = 0;
    first_ihi = nfast - 1;
    first_jlo = ip1*nmid/np1;
    first_jhi = (ip1+1)*nmid/np1 - 1;
    first_klo = ip2*nslow/np2;
    first_khi = (ip2+1)*nslow/np2 - 1;
    plan->pre_plan =
      remap_3d_create_plan(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                           first_ilo,first_ihi,first_jlo,first_jhi,
                           first_klo,first_khi,
                           FFT_PRECISION,0,0,2);
    if (plan->pre_plan == NULL) {
      free(plan);
      return NULL;
    }
  }

  /* 1d FFTs along fast axis */

  plan->length1 = nfast;
  plan->total1 = nfast * (first_jhi-first_jlo+1) * (first_khi-first_klo+1);

  /* remap from 1st to 2nd FFT
     choose which axis is split over np1 vs np2 to minimize communication
     second indices = distribution after 2nd set of FFTs */

  second_ilo = ip1*nfast/np1;
  second_ihi = (ip1+1)*nfast/np1 - 1;
  second_jlo = 0;
  second_jhi = nmid - 1;
  second_klo = ip2*nslow/np2;
  second_khi = (ip2+1)*nslow/np2 - 1;
  plan->mid1_plan =
    remap_3d_create_plan(comm,
                         first_ilo,first_ihi,first_jlo,first_jhi,
                         first_klo,first_khi,
                         second_ilo,second_ihi,second_jlo,second_jhi,
                         second_klo,second_khi,
                         FFT_PRECISION,1,0,2);
  if (plan->mid1_plan == NULL) {
    free(plan);
    return NULL;
  }

  /* 1d FFTs along mid axis */

  plan->length2 = nmid;
  plan->total2 = (second_ihi-second_ilo+1) * nmid * (second_khi-second_klo+1);

  /* remap from 2nd to 3rd FFT
     if final distribution is permute=2 with all procs owning entire slow axis
     then this remapping goes directly to final distribution
     third indices = distribution after 3rd set of FFTs */

  if (permute == 2 && out_klo == 0 && out_khi == nslow-1)
    flag = 0;
  else
    flag = 1;

  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

  if (remapflag == 0) {
    third_ilo = out_ilo;
    third_ihi = out_ihi;
    third_jlo = out_jlo;
    third_jhi = out_jhi;
    third_klo = out_klo;
    third_khi = out_khi;
  }
  else {
    third_ilo = ip1*nfast/np1;
    third_ihi = (ip1+1)*nfast/np1 - 1;
    third_jlo = ip2*nmid/np2;
    third_jhi = (ip2+1)*nmid/np2 - 1;
    third_klo = 0;
    third_khi = nslow - 1;
  }

  /* index triples are passed rotated (j,k,i) here and (k,i,j) for the
     final remap below */
  plan->mid2_plan =
    remap_3d_create_plan(comm,
                         second_jlo,second_jhi,second_klo,second_khi,
                         second_ilo,second_ihi,
                         third_jlo,third_jhi,third_klo,third_khi,
                         third_ilo,third_ihi,
                         FFT_PRECISION,1,0,2);
  if (plan->mid2_plan == NULL) {
    free(plan);
    return NULL;
  }

  /* 1d FFTs along slow axis */

  plan->length3 = nslow;
  plan->total3 = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) * nslow;

  /* remap from 3rd FFT to final distribution
     not needed if permute = 2 and third indices = out indices on all procs */

  if (permute == 2 &&
      out_ilo == third_ilo && out_ihi == third_ihi &&
      out_jlo == third_jlo && out_jhi == third_jhi &&
      out_klo == third_klo && out_khi == third_khi)
    flag = 0;
  else
    flag = 1;

  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

  if (remapflag == 0)
    plan->post_plan = NULL;
  else {
    plan->post_plan =
      remap_3d_create_plan(comm,
                           third_klo,third_khi,third_ilo,third_ihi,
                           third_jlo,third_jhi,
                           out_klo,out_khi,out_ilo,out_ihi,
                           out_jlo,out_jhi,
                           FFT_PRECISION,(permute+1)%3,0,2);
    if (plan->post_plan == NULL) {
      free(plan);
      return NULL;
    }
  }

  /* configure plan memory pointers and allocate work space
     out_size = amount of memory given to FFT by user
     first/second/third_size = amount of memory needed after pre,mid1,mid2 remaps
     copy_size = amount needed internally for extra copy of data
     scratch_size = amount needed internally for remap scratch space
     for each remap:
     use out space for result if big enough, else require copy buffer
     accumulate largest required remap scratch space */

  out_size = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1);
  first_size = (first_ihi-first_ilo+1) * (first_jhi-first_jlo+1) *
    (first_khi-first_klo+1);
  second_size = (second_ihi-second_ilo+1) * (second_jhi-second_jlo+1) *
    (second_khi-second_klo+1);
  third_size = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) *
    (third_khi-third_klo+1);

  copy_size = 0;
  scratch_size = 0;

  if (plan->pre_plan) {
    if (first_size <= out_size)
      plan->pre_target = 0;
    else {
      plan->pre_target = 1;
      copy_size = MAX(copy_size,first_size);
    }
    scratch_size = MAX(scratch_size,first_size);
  }

  if (plan->mid1_plan) {
    if (second_size <= out_size)
      plan->mid1_target = 0;
    else {
      plan->mid1_target = 1;
      copy_size = MAX(copy_size,second_size);
    }
    scratch_size = MAX(scratch_size,second_size);
  }

  if (plan->mid2_plan) {
    if (third_size <= out_size)
      plan->mid2_target = 0;
    else {
      plan->mid2_target = 1;
      copy_size = MAX(copy_size,third_size);
    }
    scratch_size = MAX(scratch_size,third_size);
  }

  if (plan->post_plan)
    scratch_size = MAX(scratch_size,out_size);

  *nbuf = copy_size + scratch_size;

  if (copy_size) {
    plan->copy = (FFT_DATA *) malloc(copy_size*sizeof(FFT_DATA));
    if (plan->copy == NULL) {
      free(plan);
      return NULL;
    }
  }
  else plan->copy = NULL;

  if (scratch_size) {
    plan->scratch = (FFT_DATA *) malloc(scratch_size*sizeof(FFT_DATA));
    if (plan->scratch == NULL) {
      free(plan->copy);   /* may be NULL, which free() accepts */
      free(plan);
      return NULL;
    }
  }
  else plan->scratch = NULL;

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    /* form the product in double: nfast*nmid*nslow overflows int for
       grids of 2^31 points or more */
    plan->norm = 1.0/((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }
  return plan;
}
Пример #3
0
// Create a plan for a distributed 3d FFT carried out as three sets of 1d FFTs
// (fast, mid, slow axis) separated by data remaps, and set up the 1d FFT
// backend selected at compile time by the FFT_* macros.
//
//   comm                 MPI communicator of the participating processes
//   nfast,nmid,nslow     global grid size along the fast, mid and slow axes
//   in_ilo .. in_khi     inclusive index bounds of this process's input brick
//   out_ilo .. out_khi   inclusive index bounds of this process's output brick
//   scaled               requests scaling; how it maps onto plan->scaled and
//                        plan->norm depends on the backend branch below
//   permute              forwarded as (permute+1)%3 to the final remap; with
//                        permute == 2 the 2nd->3rd remap may already produce
//                        the final layout, making the final remap unnecessary
//   nbuf                 out: copy_size + scratch_size, counted in FFT_DATA
//
// Returns the new plan, or NULL if an allocation, a remap-plan creation or
// (FFT_INTEL only) the 2,3,5 factor check fails.  On failure the plan struct
// and the buffers malloc'ed here are freed; remap plans that were already
// created are NOT released here, because no destroy routine for them is
// visible from this function.
struct fft_plan_3d *fft_3d_create_plan(
       MPI_Comm comm, int nfast, int nmid, int nslow,
       int in_ilo, int in_ihi, int in_jlo, int in_jhi,
       int in_klo, int in_khi,
       int out_ilo, int out_ihi, int out_jlo, int out_jhi,
       int out_klo, int out_khi,
       int scaled, int permute, int *nbuf)
{
  struct fft_plan_3d *plan;
  int me,nprocs;
  int i,num,flag,remapflag,fftflag;
  int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi;
  int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi;
  int third_ilo,third_ihi,third_jlo,third_jhi,third_klo,third_khi;
  int out_size,first_size,second_size,third_size,copy_size,scratch_size;
  int np1,np2,ip1,ip2;
  int list[50];

  // system specific variables

#ifdef FFT_SCSL
  FFT_DATA dummy_d[5];
  FFT_PREC dummy_p[5];
  int isign,isys;
  FFT_PREC scalef;
#endif
#ifdef FFT_INTEL
  FFT_DATA dummy;
#endif
#ifdef FFT_T3E
  FFT_DATA dummy[5];
  int isign,isys;
  double scalef;
#endif

  // query MPI info

  MPI_Comm_rank(comm,&me);
  MPI_Comm_size(comm,&nprocs);

  // compute division of procs in 2 dimensions not on-processor

  bifactor(nprocs,&np1,&np2);
  ip1 = me % np1;
  ip2 = me/np1;

  // allocate memory for plan data struct

  plan = (struct fft_plan_3d *) malloc(sizeof(struct fft_plan_3d));
  if (plan == NULL) return NULL;

  // remap from initial distribution to layout needed for 1st set of 1d FFTs
  // not needed if all procs own entire fast axis initially
  // first indices = distribution after 1st set of FFTs

  if (in_ilo == 0 && in_ihi == nfast-1)
    flag = 0;
  else
    flag = 1;

  // MPI_MAX: a single process needing the remap forces it on everyone

  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

  if (remapflag == 0) {
    first_ilo = in_ilo;
    first_ihi = in_ihi;
    first_jlo = in_jlo;
    first_jhi = in_jhi;
    first_klo = in_klo;
    first_khi = in_khi;
    plan->pre_plan = NULL;
  }
  else {
    first_ilo = 0;
    first_ihi = nfast - 1;
    first_jlo = ip1*nmid/np1;
    first_jhi = (ip1+1)*nmid/np1 - 1;
    first_klo = ip2*nslow/np2;
    first_khi = (ip2+1)*nslow/np2 - 1;
    plan->pre_plan =
      remap_3d_create_plan(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                           first_ilo,first_ihi,first_jlo,first_jhi,
                           first_klo,first_khi,2,0,0,FFT_PRECISION);
    if (plan->pre_plan == NULL) {
      free(plan);
      return NULL;
    }
  }

  // 1d FFTs along fast axis

  plan->length1 = nfast;
  plan->total1 = nfast * (first_jhi-first_jlo+1) * (first_khi-first_klo+1);

  // remap from 1st to 2nd FFT
  // choose which axis is split over np1 vs np2 to minimize communication
  // second indices = distribution after 2nd set of FFTs

  second_ilo = ip1*nfast/np1;
  second_ihi = (ip1+1)*nfast/np1 - 1;
  second_jlo = 0;
  second_jhi = nmid - 1;
  second_klo = ip2*nslow/np2;
  second_khi = (ip2+1)*nslow/np2 - 1;
  plan->mid1_plan =
      remap_3d_create_plan(comm,
                           first_ilo,first_ihi,first_jlo,first_jhi,
                           first_klo,first_khi,
                           second_ilo,second_ihi,second_jlo,second_jhi,
                           second_klo,second_khi,2,1,0,FFT_PRECISION);
  if (plan->mid1_plan == NULL) {
    free(plan);
    return NULL;
  }

  // 1d FFTs along mid axis

  plan->length2 = nmid;
  plan->total2 = (second_ihi-second_ilo+1) * nmid * (second_khi-second_klo+1);

  // remap from 2nd to 3rd FFT
  // if final distribution is permute=2 with all procs owning entire slow axis
  //   then this remapping goes directly to final distribution
  //  third indices = distribution after 3rd set of FFTs

  if (permute == 2 && out_klo == 0 && out_khi == nslow-1)
    flag = 0;
  else
    flag = 1;

  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

  if (remapflag == 0) {
    third_ilo = out_ilo;
    third_ihi = out_ihi;
    third_jlo = out_jlo;
    third_jhi = out_jhi;
    third_klo = out_klo;
    third_khi = out_khi;
  }
  else {
    third_ilo = ip1*nfast/np1;
    third_ihi = (ip1+1)*nfast/np1 - 1;
    third_jlo = ip2*nmid/np2;
    third_jhi = (ip2+1)*nmid/np2 - 1;
    third_klo = 0;
    third_khi = nslow - 1;
  }

  // index triples are passed rotated (j,k,i) here and (k,i,j) for the
  // final remap below

  plan->mid2_plan =
    remap_3d_create_plan(comm,
                         second_jlo,second_jhi,second_klo,second_khi,
                         second_ilo,second_ihi,
                         third_jlo,third_jhi,third_klo,third_khi,
                         third_ilo,third_ihi,2,1,0,FFT_PRECISION);
  if (plan->mid2_plan == NULL) {
    free(plan);
    return NULL;
  }

  // 1d FFTs along slow axis

  plan->length3 = nslow;
  plan->total3 = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) * nslow;

  // remap from 3rd FFT to final distribution
  //  not needed if permute = 2 and third indices = out indices on all procs

  if (permute == 2 &&
      out_ilo == third_ilo && out_ihi == third_ihi &&
      out_jlo == third_jlo && out_jhi == third_jhi &&
      out_klo == third_klo && out_khi == third_khi)
    flag = 0;
  else
    flag = 1;

  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

  if (remapflag == 0)
    plan->post_plan = NULL;
  else {
    plan->post_plan =
      remap_3d_create_plan(comm,
                           third_klo,third_khi,third_ilo,third_ihi,
                           third_jlo,third_jhi,
                           out_klo,out_khi,out_ilo,out_ihi,
                           out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION);
    if (plan->post_plan == NULL) {
      free(plan);
      return NULL;
    }
  }

  // configure plan memory pointers and allocate work space
  // out_size = amount of memory given to FFT by user
  // first/second/third_size = amount of memory needed after pre,mid1,mid2 remaps
  // copy_size = amount needed internally for extra copy of data
  // scratch_size = amount needed internally for remap scratch space
  // for each remap:
  //   out space used for result if big enough, else require copy buffer
  //   accumulate largest required remap scratch space

  out_size = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1);
  first_size = (first_ihi-first_ilo+1) * (first_jhi-first_jlo+1) *
    (first_khi-first_klo+1);
  second_size = (second_ihi-second_ilo+1) * (second_jhi-second_jlo+1) *
    (second_khi-second_klo+1);
  third_size = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) *
    (third_khi-third_klo+1);

  copy_size = 0;
  scratch_size = 0;

  if (plan->pre_plan) {
    if (first_size <= out_size)
      plan->pre_target = 0;
    else {
      plan->pre_target = 1;
      copy_size = MAX(copy_size,first_size);
    }
    scratch_size = MAX(scratch_size,first_size);
  }

  if (plan->mid1_plan) {
    if (second_size <= out_size)
      plan->mid1_target = 0;
    else {
      plan->mid1_target = 1;
      copy_size = MAX(copy_size,second_size);
    }
    scratch_size = MAX(scratch_size,second_size);
  }

  if (plan->mid2_plan) {
    if (third_size <= out_size)
      plan->mid2_target = 0;
    else {
      plan->mid2_target = 1;
      copy_size = MAX(copy_size,third_size);
    }
    scratch_size = MAX(scratch_size,third_size);
  }

  if (plan->post_plan)
    scratch_size = MAX(scratch_size,out_size);

  *nbuf = copy_size + scratch_size;

  if (copy_size) {
    plan->copy = (FFT_DATA *) malloc(copy_size*sizeof(FFT_DATA));
    if (plan->copy == NULL) {
      free(plan);
      return NULL;
    }
  }
  else plan->copy = NULL;

  if (scratch_size) {
    plan->scratch = (FFT_DATA *) malloc(scratch_size*sizeof(FFT_DATA));
    if (plan->scratch == NULL) {
      free(plan->copy);   // may be NULL, which free() accepts
      free(plan);
      return NULL;
    }
  }
  else plan->scratch = NULL;

  // system specific pre-computation of 1d FFT coeffs
  // and scaling normalization
  // in every branch the product nfast*nmid*nslow is formed in double,
  //   since it overflows int for grids of 2^31 points or more

#if defined(FFT_SGI)

  plan->coeff1 = (FFT_DATA *) malloc((nfast+15)*sizeof(FFT_DATA));
  plan->coeff2 = (FFT_DATA *) malloc((nmid+15)*sizeof(FFT_DATA));
  plan->coeff3 = (FFT_DATA *) malloc((nslow+15)*sizeof(FFT_DATA));

  if (plan->coeff1 == NULL || plan->coeff2 == NULL ||
      plan->coeff3 == NULL) {
    free(plan->coeff1);
    free(plan->coeff2);
    free(plan->coeff3);
    free(plan->scratch);
    free(plan->copy);
    free(plan);
    return NULL;
  }

  FFT_1D_INIT(nfast,plan->coeff1);
  FFT_1D_INIT(nmid,plan->coeff2);
  FFT_1D_INIT(nslow,plan->coeff3);

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#elif defined(FFT_SCSL)

  plan->coeff1 = (FFT_PREC *) malloc((2*nfast+30)*sizeof(FFT_PREC));
  plan->coeff2 = (FFT_PREC *) malloc((2*nmid+30)*sizeof(FFT_PREC));
  plan->coeff3 = (FFT_PREC *) malloc((2*nslow+30)*sizeof(FFT_PREC));

  if (plan->coeff1 == NULL || plan->coeff2 == NULL ||
      plan->coeff3 == NULL) {
    free(plan->coeff1);
    free(plan->coeff2);
    free(plan->coeff3);
    free(plan->scratch);
    free(plan->copy);
    free(plan);
    return NULL;
  }

  plan->work1 = (FFT_PREC *) malloc((2*nfast)*sizeof(FFT_PREC));
  plan->work2 = (FFT_PREC *) malloc((2*nmid)*sizeof(FFT_PREC));
  plan->work3 = (FFT_PREC *) malloc((2*nslow)*sizeof(FFT_PREC));

  if (plan->work1 == NULL || plan->work2 == NULL ||
      plan->work3 == NULL) {
    free(plan->work1);
    free(plan->work2);
    free(plan->work3);
    free(plan->coeff1);
    free(plan->coeff2);
    free(plan->coeff3);
    free(plan->scratch);
    free(plan->copy);
    free(plan);
    return NULL;
  }

  isign = 0;
  scalef = 1.0;
  isys = 0;

  FFT_1D_INIT(isign,nfast,scalef,dummy_d,dummy_d,plan->coeff1,dummy_p,&isys);
  FFT_1D_INIT(isign,nmid,scalef,dummy_d,dummy_d,plan->coeff2,dummy_p,&isys);
  FFT_1D_INIT(isign,nslow,scalef,dummy_d,dummy_d,plan->coeff3,dummy_p,&isys);

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#elif defined(FFT_ACML)

  plan->coeff1 = (FFT_DATA *) malloc((3*nfast+100)*sizeof(FFT_DATA));
  plan->coeff2 = (FFT_DATA *) malloc((3*nmid+100)*sizeof(FFT_DATA));
  plan->coeff3 = (FFT_DATA *) malloc((3*nslow+100)*sizeof(FFT_DATA));

  if (plan->coeff1 == NULL || plan->coeff2 == NULL ||
      plan->coeff3 == NULL) {
    free(plan->coeff1);
    free(plan->coeff2);
    free(plan->coeff3);
    free(plan->scratch);
    free(plan->copy);
    free(plan);
    return NULL;
  }

  int isign = 100;
  int isys = 1;
  int info = 0;
  FFT_DATA *dummy = NULL;

  FFT_1D(&isign,&isys,&nfast,dummy,plan->coeff1,&info);
  FFT_1D(&isign,&isys,&nmid,dummy,plan->coeff2,&info);
  FFT_1D(&isign,&isys,&nslow,dummy,plan->coeff3,&info);

  // norm and normnum are stored in both cases; only plan->scaled differs

  if (scaled == 0) {
    plan->scaled = 0;
    plan->norm = sqrt((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  } else {
    plan->scaled = 1;
    plan->norm = sqrt((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#elif defined(FFT_INTEL)

  // reject any axis length with a factor other than 2, 3 or 5

  flag = 0;

  num = 0;
  factor(nfast,&num,list);
  for (i = 0; i < num; i++)
    if (list[i] != 2 && list[i] != 3 && list[i] != 5) flag = 1;
  num = 0;
  factor(nmid,&num,list);
  for (i = 0; i < num; i++)
    if (list[i] != 2 && list[i] != 3 && list[i] != 5) flag = 1;
  num = 0;
  factor(nslow,&num,list);
  for (i = 0; i < num; i++)
    if (list[i] != 2 && list[i] != 3 && list[i] != 5) flag = 1;

  MPI_Allreduce(&flag,&fftflag,1,MPI_INT,MPI_MAX,comm);
  if (fftflag) {
    if (me == 0) printf("ERROR: FFTs are not power of 2,3,5\n");
    free(plan->scratch);
    free(plan->copy);
    free(plan);
    return NULL;
  }

  plan->coeff1 = (FFT_DATA *) malloc((3*nfast/2+1)*sizeof(FFT_DATA));
  plan->coeff2 = (FFT_DATA *) malloc((3*nmid/2+1)*sizeof(FFT_DATA));
  plan->coeff3 = (FFT_DATA *) malloc((3*nslow/2+1)*sizeof(FFT_DATA));

  if (plan->coeff1 == NULL || plan->coeff2 == NULL ||
      plan->coeff3 == NULL) {
    free(plan->coeff1);
    free(plan->coeff2);
    free(plan->coeff3);
    free(plan->scratch);
    free(plan->copy);
    free(plan);
    return NULL;
  }

  flag = 0;
  FFT_1D_INIT(&dummy,&nfast,&flag,plan->coeff1);
  FFT_1D_INIT(&dummy,&nmid,&flag,plan->coeff2);
  FFT_1D_INIT(&dummy,&nslow,&flag,plan->coeff3);

  // NOTE(review): sense is inverted relative to most other branches -
  //   presumably this backend scales on its own, so the plan stores the
  //   factor that undoes it when the caller asked for no scaling; confirm

  if (scaled == 0) {
    plan->scaled = 1;
    plan->norm = (double) nfast*nmid*nslow;
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }
  else
    plan->scaled = 0;

#elif defined(FFT_MKL)
  DftiCreateDescriptor( &(plan->handle_fast), FFT_MKL_PREC, DFTI_COMPLEX, 1, (MKL_LONG)nfast);
  DftiSetValue(plan->handle_fast, DFTI_NUMBER_OF_TRANSFORMS, (MKL_LONG)plan->total1/nfast);
  DftiSetValue(plan->handle_fast, DFTI_PLACEMENT,DFTI_INPLACE);
  DftiSetValue(plan->handle_fast, DFTI_INPUT_DISTANCE, (MKL_LONG)nfast);
  DftiSetValue(plan->handle_fast, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nfast);
  DftiCommitDescriptor(plan->handle_fast);

  DftiCreateDescriptor( &(plan->handle_mid), FFT_MKL_PREC, DFTI_COMPLEX, 1, (MKL_LONG)nmid);
  DftiSetValue(plan->handle_mid, DFTI_NUMBER_OF_TRANSFORMS, (MKL_LONG)plan->total2/nmid);
  DftiSetValue(plan->handle_mid, DFTI_PLACEMENT,DFTI_INPLACE);
  DftiSetValue(plan->handle_mid, DFTI_INPUT_DISTANCE, (MKL_LONG)nmid);
  DftiSetValue(plan->handle_mid, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nmid);
  DftiCommitDescriptor(plan->handle_mid);

  DftiCreateDescriptor( &(plan->handle_slow), FFT_MKL_PREC, DFTI_COMPLEX, 1, (MKL_LONG)nslow);
  DftiSetValue(plan->handle_slow, DFTI_NUMBER_OF_TRANSFORMS, (MKL_LONG)plan->total3/nslow);
  DftiSetValue(plan->handle_slow, DFTI_PLACEMENT,DFTI_INPLACE);
  DftiSetValue(plan->handle_slow, DFTI_INPUT_DISTANCE, (MKL_LONG)nslow);
  DftiSetValue(plan->handle_slow, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nslow);
  DftiCommitDescriptor(plan->handle_slow);

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#elif defined(FFT_DEC)

  // NOTE(review): same inverted sense as the FFT_INTEL branch - confirm

  if (scaled == 0) {
    plan->scaled = 1;
    plan->norm = (double) nfast*nmid*nslow;
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }
  else
    plan->scaled = 0;

#elif defined(FFT_T3E)

  plan->coeff1 = (double *) malloc((12*nfast)*sizeof(double));
  plan->coeff2 = (double *) malloc((12*nmid)*sizeof(double));
  plan->coeff3 = (double *) malloc((12*nslow)*sizeof(double));

  if (plan->coeff1 == NULL || plan->coeff2 == NULL ||
      plan->coeff3 == NULL) {
    free(plan->coeff1);
    free(plan->coeff2);
    free(plan->coeff3);
    free(plan->scratch);
    free(plan->copy);
    free(plan);
    return NULL;
  }

  plan->work1 = (double *) malloc((8*nfast)*sizeof(double));
  plan->work2 = (double *) malloc((8*nmid)*sizeof(double));
  plan->work3 = (double *) malloc((8*nslow)*sizeof(double));

  if (plan->work1 == NULL || plan->work2 == NULL ||
      plan->work3 == NULL) {
    free(plan->work1);
    free(plan->work2);
    free(plan->work3);
    free(plan->coeff1);
    free(plan->coeff2);
    free(plan->coeff3);
    free(plan->scratch);
    free(plan->copy);
    free(plan);
    return NULL;
  }

  isign = 0;
  scalef = 1.0;
  isys = 0;

  FFT_1D_INIT(&isign,&nfast,&scalef,dummy,dummy,plan->coeff1,dummy,&isys);
  FFT_1D_INIT(&isign,&nmid,&scalef,dummy,dummy,plan->coeff2,dummy,&isys);
  FFT_1D_INIT(&isign,&nslow,&scalef,dummy,dummy,plan->coeff3,dummy,&isys);

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#elif defined(FFT_FFTW2)

  // axes of equal length share one pair of 1d plans

  plan->plan_fast_forward =
    fftw_create_plan(nfast,FFTW_FORWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
  plan->plan_fast_backward =
    fftw_create_plan(nfast,FFTW_BACKWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);

  if (nmid == nfast) {
    plan->plan_mid_forward = plan->plan_fast_forward;
    plan->plan_mid_backward = plan->plan_fast_backward;
  }
  else {
    plan->plan_mid_forward =
      fftw_create_plan(nmid,FFTW_FORWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
    plan->plan_mid_backward =
      fftw_create_plan(nmid,FFTW_BACKWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
  }

  if (nslow == nfast) {
    plan->plan_slow_forward = plan->plan_fast_forward;
    plan->plan_slow_backward = plan->plan_fast_backward;
  }
  else if (nslow == nmid) {
    plan->plan_slow_forward = plan->plan_mid_forward;
    plan->plan_slow_backward = plan->plan_mid_backward;
  }
  else {
    plan->plan_slow_forward =
      fftw_create_plan(nslow,FFTW_FORWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
    plan->plan_slow_backward =
      fftw_create_plan(nslow,FFTW_BACKWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
  }

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#elif defined(FFT_FFTW3)
  plan->plan_fast_forward =
    FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1,
                            NULL,&nfast,1,plan->length1,
                            NULL,&nfast,1,plan->length1,
                            FFTW_FORWARD,FFTW_ESTIMATE);
  plan->plan_fast_backward =
    FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1,
                            NULL,&nfast,1,plan->length1,
                            NULL,&nfast,1,plan->length1,
                            FFTW_BACKWARD,FFTW_ESTIMATE);
  plan->plan_mid_forward =
    FFTW_API(plan_many_dft)(1, &nmid,plan->total2/plan->length2,
                            NULL,&nmid,1,plan->length2,
                            NULL,&nmid,1,plan->length2,
                            FFTW_FORWARD,FFTW_ESTIMATE);
  plan->plan_mid_backward =
    FFTW_API(plan_many_dft)(1, &nmid,plan->total2/plan->length2,
                            NULL,&nmid,1,plan->length2,
                            NULL,&nmid,1,plan->length2,
                            FFTW_BACKWARD,FFTW_ESTIMATE);
  plan->plan_slow_forward =
    FFTW_API(plan_many_dft)(1, &nslow,plan->total3/plan->length3,
                            NULL,&nslow,1,plan->length3,
                            NULL,&nslow,1,plan->length3,
                            FFTW_FORWARD,FFTW_ESTIMATE);
  plan->plan_slow_backward =
    FFTW_API(plan_many_dft)(1, &nslow,plan->total3/plan->length3,
                            NULL,&nslow,1,plan->length3,
                            NULL,&nslow,1,plan->length3,
                            FFTW_BACKWARD,FFTW_ESTIMATE);

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }
#else
  // default backend: kiss_fft; axes of equal length share one pair of cfgs

  plan->cfg_fast_forward = kiss_fft_alloc(nfast,0,NULL,NULL);
  plan->cfg_fast_backward = kiss_fft_alloc(nfast,1,NULL,NULL);

  if (nmid == nfast) {
    plan->cfg_mid_forward = plan->cfg_fast_forward;
    plan->cfg_mid_backward = plan->cfg_fast_backward;
  }
  else {
    plan->cfg_mid_forward = kiss_fft_alloc(nmid,0,NULL,NULL);
    plan->cfg_mid_backward = kiss_fft_alloc(nmid,1,NULL,NULL);
  }

  if (nslow == nfast) {
    plan->cfg_slow_forward = plan->cfg_fast_forward;
    plan->cfg_slow_backward = plan->cfg_fast_backward;
  }
  else if (nslow == nmid) {
    plan->cfg_slow_forward = plan->cfg_mid_forward;
    plan->cfg_slow_backward = plan->cfg_mid_backward;
  }
  else {
    plan->cfg_slow_forward = kiss_fft_alloc(nslow,0,NULL,NULL);
    plan->cfg_slow_backward = kiss_fft_alloc(nslow,1,NULL,NULL);
  }

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#endif

  return plan;
}
Пример #4
0
struct fft_plan_3d *fft_3d_create_plan(
    MPI_Comm comm, int nfast, int nmid, int nslow,
    int in_ilo, int in_ihi, int in_jlo, int in_jhi,
    int in_klo, int in_khi,
    int out_ilo, int out_ihi, int out_jlo, int out_jhi,
    int out_klo, int out_khi,
    int scaled, int permute, int *nbuf)

{
    struct fft_plan_3d *plan;
    int me,nprocs;
    int i,num,flag,remapflag,fftflag;
    int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi;
    int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi;
    int third_ilo,third_ihi,third_jlo,third_jhi,third_klo,third_khi;
    int out_size,first_size,second_size,third_size,copy_size,scratch_size;
    int np1,np2,ip1,ip2;
    int list[50];

    /* system specific variables */

#ifdef FFT_INTEL
    FFT_DATA dummy;
#endif
#ifdef FFT_T3E
    FFT_DATA dummy[5];
    int isign,isys;
    double scalef;
#endif

    /* query MPI info */

    MPI_Comm_rank(comm,&me);
    MPI_Comm_size(comm,&nprocs);

    /* compute division of procs in 2 dimensions not on-processor */

    bifactor(nprocs,&np1,&np2);
    ip1 = me % np1;
    ip2 = me/np1;

    /* allocate memory for plan data struct */

    plan = (struct fft_plan_3d *) malloc(sizeof(struct fft_plan_3d));
    if (plan == NULL) return NULL;

    /* remap from initial distribution to layout needed for 1st set of 1d FFTs
       not needed if all procs own entire fast axis initially
       first indices = distribution after 1st set of FFTs */

    if (in_ilo == 0 && in_ihi == nfast-1)
        flag = 0;
    else
        flag = 1;

    MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

    if (remapflag == 0) {
        first_ilo = in_ilo;
        first_ihi = in_ihi;
        first_jlo = in_jlo;
        first_jhi = in_jhi;
        first_klo = in_klo;
        first_khi = in_khi;
        plan->pre_plan = NULL;
    }
    else {
        first_ilo = 0;
        first_ihi = nfast - 1;
        first_jlo = ip1*nmid/np1;
        first_jhi = (ip1+1)*nmid/np1 - 1;
        first_klo = ip2*nslow/np2;
        first_khi = (ip2+1)*nslow/np2 - 1;
        plan->pre_plan =
            remap_3d_create_plan(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                                 first_ilo,first_ihi,first_jlo,first_jhi,
                                 first_klo,first_khi,
                                 FFT_PRECISION,0,0,2);
        if (plan->pre_plan == NULL) return NULL;
    }

    /* 1d FFTs along fast axis */

    plan->length1 = nfast;
    plan->total1 = nfast * (first_jhi-first_jlo+1) * (first_khi-first_klo+1);

    /* remap from 1st to 2nd FFT
       choose which axis is split over np1 vs np2 to minimize communication
       second indices = distribution after 2nd set of FFTs */

    second_ilo = ip1*nfast/np1;
    second_ihi = (ip1+1)*nfast/np1 - 1;
    second_jlo = 0;
    second_jhi = nmid - 1;
    second_klo = ip2*nslow/np2;
    second_khi = (ip2+1)*nslow/np2 - 1;
    plan->mid1_plan =
        remap_3d_create_plan(comm,
                             first_ilo,first_ihi,first_jlo,first_jhi,
                             first_klo,first_khi,
                             second_ilo,second_ihi,second_jlo,second_jhi,
                             second_klo,second_khi,
                             FFT_PRECISION,1,0,2);
    if (plan->mid1_plan == NULL) return NULL;

    /* 1d FFTs along mid axis */

    plan->length2 = nmid;
    plan->total2 = (second_ihi-second_ilo+1) * nmid * (second_khi-second_klo+1);

    /* remap from 2nd to 3rd FFT
       if final distribution is permute=2 with all procs owning entire slow axis
         then this remapping goes directly to final distribution
       third indices = distribution after 3rd set of FFTs */

    if (permute == 2 && out_klo == 0 && out_khi == nslow-1)
        flag = 0;
    else
        flag = 1;

    MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

    if (remapflag == 0) {
        third_ilo = out_ilo;
        third_ihi = out_ihi;
        third_jlo = out_jlo;
        third_jhi = out_jhi;
        third_klo = out_klo;
        third_khi = out_khi;
    }
    else {
        third_ilo = ip1*nfast/np1;
        third_ihi = (ip1+1)*nfast/np1 - 1;
        third_jlo = ip2*nmid/np2;
        third_jhi = (ip2+1)*nmid/np2 - 1;
        third_klo = 0;
        third_khi = nslow - 1;
    }

    plan->mid2_plan =
        remap_3d_create_plan(comm,
                             second_jlo,second_jhi,second_klo,second_khi,
                             second_ilo,second_ihi,
                             third_jlo,third_jhi,third_klo,third_khi,
                             third_ilo,third_ihi,
                             FFT_PRECISION,1,0,2);
    if (plan->mid2_plan == NULL) return NULL;

    /* 1d FFTs along slow axis */

    plan->length3 = nslow;
    plan->total3 = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) * nslow;

    /* remap from 3rd FFT to final distribution
       not needed if permute = 2 and third indices = out indices on all procs */

    if (permute == 2 &&
            out_ilo == third_ilo && out_ihi == third_ihi &&
            out_jlo == third_jlo && out_jhi == third_jhi &&
            out_klo == third_klo && out_khi == third_khi)
        flag = 0;
    else
        flag = 1;

    MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

    if (remapflag == 0)
        plan->post_plan = NULL;
    else {
        plan->post_plan =
            remap_3d_create_plan(comm,
                                 third_klo,third_khi,third_ilo,third_ihi,
                                 third_jlo,third_jhi,
                                 out_klo,out_khi,out_ilo,out_ihi,
                                 out_jlo,out_jhi,
                                 FFT_PRECISION,(permute+1)%3,0,2);
        if (plan->post_plan == NULL) return NULL;
    }

    /* configure plan memory pointers and allocate work space
       out_size = amount of memory given to FFT by user
       first/second/third_size = amount of memory needed after pre,mid1,mid2 remaps
       copy_size = amount needed internally for extra copy of data
       scratch_size = amount needed internally for remap scratch space
       for each remap:
         use out space for result if big enough, else require copy buffer
         accumulate largest required remap scratch space */

    out_size = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1);
    first_size = (first_ihi-first_ilo+1) * (first_jhi-first_jlo+1) *
                 (first_khi-first_klo+1);
    second_size = (second_ihi-second_ilo+1) * (second_jhi-second_jlo+1) *
                  (second_khi-second_klo+1);
    third_size = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) *
                 (third_khi-third_klo+1);

    copy_size = 0;
    scratch_size = 0;

    if (plan->pre_plan) {
        if (first_size <= out_size)
            plan->pre_target = 0;
        else {
            plan->pre_target = 1;
            copy_size = MAX(copy_size,first_size);
        }
        scratch_size = MAX(scratch_size,first_size);
    }

    if (plan->mid1_plan) {
        if (second_size <= out_size)
            plan->mid1_target = 0;
        else {
            plan->mid1_target = 1;
            copy_size = MAX(copy_size,second_size);
        }
        scratch_size = MAX(scratch_size,second_size);
    }

    if (plan->mid2_plan) {
        if (third_size <= out_size)
            plan->mid2_target = 0;
        else {
            plan->mid2_target = 1;
            copy_size = MAX(copy_size,third_size);
        }
        scratch_size = MAX(scratch_size,third_size);
    }

    if (plan->post_plan)
        scratch_size = MAX(scratch_size,out_size);

    *nbuf = copy_size + scratch_size;

    if (copy_size) {
        plan->copy = (FFT_DATA *) malloc(copy_size*sizeof(FFT_DATA));
        if (plan->copy == NULL) return NULL;
    }
    else plan->copy = NULL;

    if (scratch_size) {
        plan->scratch = (FFT_DATA *) malloc(scratch_size*sizeof(FFT_DATA));
        if (plan->scratch == NULL) return NULL;
    }
    else plan->scratch = NULL;

    /* system specific pre-computation of 1d FFT coeffs
       and scaling normalization */

#ifdef FFT_SGI

    plan->coeff1 = (FFT_DATA *) malloc((nfast+15)*sizeof(FFT_DATA));
    plan->coeff2 = (FFT_DATA *) malloc((nmid+15)*sizeof(FFT_DATA));
    plan->coeff3 = (FFT_DATA *) malloc((nslow+15)*sizeof(FFT_DATA));

    if (plan->coeff1 == NULL || plan->coeff2 == NULL ||
            plan->coeff3 == NULL) return NULL;

    FFT_1D_INIT(nfast,plan->coeff1);
    FFT_1D_INIT(nmid,plan->coeff2);
    FFT_1D_INIT(nslow,plan->coeff3);

    if (scaled == 0)
        plan->scaled = 0;
    else {
        plan->scaled = 1;
        plan->norm = 1.0/(nfast*nmid*nslow);
        plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
                        (out_khi-out_klo+1);
    }

#endif

#ifdef FFT_INTEL

    flag = 0;

    num = 0;
    factor(nfast,&num,list);
    for (i = 0; i < num; i++)
        if (list[i] != 2 && list[i] != 3 && list[i] != 5) flag = 1;
    num = 0;
    factor(nmid,&num,list);
    for (i = 0; i < num; i++)
        if (list[i] != 2 && list[i] != 3 && list[i] != 5) flag = 1;
    num = 0;
    factor(nslow,&num,list);
    for (i = 0; i < num; i++)
        if (list[i] != 2 && list[i] != 3 && list[i] != 5) flag = 1;

    MPI_Allreduce(&flag,&fftflag,1,MPI_INT,MPI_MAX,comm);
    if (fftflag) {
        if (me == 0) printf("ERROR: FFTs are not power of 2,3,5\n");
        return NULL;
    }

    plan->coeff1 = (FFT_DATA *) malloc((3*nfast/2+1)*sizeof(FFT_DATA));
    plan->coeff2 = (FFT_DATA *) malloc((3*nmid/2+1)*sizeof(FFT_DATA));
    plan->coeff3 = (FFT_DATA *) malloc((3*nslow/2+1)*sizeof(FFT_DATA));

    if (plan->coeff1 == NULL || plan->coeff2 == NULL ||
            plan->coeff3 == NULL) return NULL;

    flag = 0;
    FFT_1D_INIT(&dummy,&nfast,&flag,plan->coeff1);
    FFT_1D_INIT(&dummy,&nmid,&flag,plan->coeff2);
    FFT_1D_INIT(&dummy,&nslow,&flag,plan->coeff3);

    if (scaled == 0) {
        plan->scaled = 1;
        plan->norm = nfast*nmid*nslow;
        plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
                        (out_khi-out_klo+1);
    }
    else
        plan->scaled = 0;

#endif

#ifdef FFT_DEC

    if (scaled == 0) {
        plan->scaled = 1;
        plan->norm = nfast*nmid*nslow;
        plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
                        (out_khi-out_klo+1);
    }
    else
        plan->scaled = 0;

#endif

#ifdef FFT_T3E

    plan->coeff1 = (double *) malloc((12*nfast)*sizeof(double));
    plan->coeff2 = (double *) malloc((12*nmid)*sizeof(double));
    plan->coeff3 = (double *) malloc((12*nslow)*sizeof(double));

    if (plan->coeff1 == NULL || plan->coeff2 == NULL ||
            plan->coeff3 == NULL) return NULL;

    plan->work1 = (double *) malloc((8*nfast)*sizeof(double));
    plan->work2 = (double *) malloc((8*nmid)*sizeof(double));
    plan->work3 = (double *) malloc((8*nslow)*sizeof(double));

    if (plan->work1 == NULL || plan->work2 == NULL ||
            plan->work3 == NULL) return NULL;

    isign = 0;
    scalef = 1.0;
    isys = 0;

    FFT_1D_INIT(&isign,&nfast,&scalef,dummy,dummy,plan->coeff1,dummy,&isys);
    FFT_1D_INIT(&isign,&nmid,&scalef,dummy,dummy,plan->coeff2,dummy,&isys);
    FFT_1D_INIT(&isign,&nslow,&scalef,dummy,dummy,plan->coeff3,dummy,&isys);

    if (scaled == 0)
        plan->scaled = 0;
    else {
        plan->scaled = 1;
        plan->norm = 1.0/(nfast*nmid*nslow);
        plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
                        (out_khi-out_klo+1);
    }

#endif

#ifdef FFT_FFTW

    plan->plan_fast_forward =
        fftw_create_plan(nfast,FFTW_FORWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
    plan->plan_fast_backward =
        fftw_create_plan(nfast,FFTW_BACKWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);

    if (nmid == nfast) {
        plan->plan_mid_forward = plan->plan_fast_forward;
        plan->plan_mid_backward = plan->plan_fast_backward;
    }
    else {
        plan->plan_mid_forward =
            fftw_create_plan(nmid,FFTW_FORWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
        plan->plan_mid_backward =
            fftw_create_plan(nmid,FFTW_BACKWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
    }

    if (nslow == nfast) {
        plan->plan_slow_forward = plan->plan_fast_forward;
        plan->plan_slow_backward = plan->plan_fast_backward;
    }
    else if (nslow == nmid) {
        plan->plan_slow_forward = plan->plan_mid_forward;
        plan->plan_slow_backward = plan->plan_mid_backward;
    }
    else {
        plan->plan_slow_forward =
            fftw_create_plan(nslow,FFTW_FORWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
        plan->plan_slow_backward =
            fftw_create_plan(nslow,FFTW_BACKWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
    }

    if (scaled == 0)
        plan->scaled = 0;
    else {
        plan->scaled = 1;
        plan->norm = 1.0/(nfast*nmid*nslow);
        plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
                        (out_khi-out_klo+1);
    }

#endif

    return plan;
}