예제 #1
0
// Runs an in-place DFT over `inout`, using FFTW's guru interface for
// split (separate real / imaginary array) complex data.
//
//   inout            - matrix transformed in place
//   loop_dims_select - forwarded to make_iodims(), which uses it together with
//                      the matrix shape to fill the transform and loop
//                      dimension descriptors
//   fftw_flags       - FFTW planner flags; FFTW_UNALIGNED is added when
//                      os::disable_SSE_for_FFTW() reports true (see os.h)
//   forward          - when false, the real and imaginary pointers are handed
//                      to FFTW in exchanged order. FFTW's split interface has
//                      no sign argument; exchanging the two arrays is its
//                      documented way of getting the opposite direction.
//
// NOTE(review): the plan is created on the live data with caller-chosen flags.
// FFTW documents that planner modes other than FFTW_ESTIMATE may overwrite the
// arrays while planning - confirm what flags callers pass.
static void transform(ComplexMatrix &inout, const std::vector<int> &loop_dims_select, unsigned fftw_flags, bool forward)
{
	if (os::disable_SSE_for_FFTW())
		fftw_flags |= FFTW_UNALIGNED; // see os.h

	// Dimension descriptors: the ones being transformed, and the ones looped over.
	int transform_rank, batch_rank;
	fftw_iodim transform_dims[16], batch_dims[16];
	make_iodims(inout.getShape(), loop_dims_select, transform_rank, transform_dims, batch_rank, batch_dims);

	ComplexMatrix::accessor acc(inout);
	double *const real_part = acc.ptr_real();
	double *const imag_part = acc.ptr_imag();

	// Pick the argument order once instead of swapping the pointers afterwards.
	double *const first = forward ? real_part : imag_part;
	double *const second = forward ? imag_part : real_part;

	const fftw_plan plan = fftw_plan_guru_split_dft(
		transform_rank, transform_dims,
		batch_rank, batch_dims,
		first, second, // in
		first, second, // out (same arrays: in-place)
		fftw_flags);
	assert(plan);

	fftw_execute_split_dft(plan, first, second, first, second);
	fftw_destroy_plan(plan);
}
// Calling convention:
//  comp_filterbank(f,g,a);
//
// MEX gateway. Reads f = prhs[0], g = prhs[1] (cell array of filter structs)
// and a = prhs[2], sorts the filters into three groups according to the struct
// fields they carry, forwards each group to a MATLAB-level routine through
// mexCallMATLAB ("comp_filterbank_td", "comp_filterbank_fft",
// "comp_filterbank_fftbl") and collects the per-filter results in the M-by-1
// cell array returned as plhs[0].
//
// The block relies on names that are not declared inside it: mxF, p_double,
// p_float, MAXARRAYLEN, filterbankAtExit and the UNUSED() macro.
//
// NOTE(review): nrhs is marked UNUSED and never checked, yet prhs[0..2] are
// read unconditionally - confirm that callers always pass three arguments.
void mexFunction( int UNUSED(nlhs), mxArray *plhs[],
                  int UNUSED(nrhs), const mxArray *prhs[] )
{
   // Register the exit handler only on the first invocation.
   static int atExitRegistered = 0;
   if(!atExitRegistered)
   {
       atExitRegistered = 1;
       mexAtExit(filterbankAtExit);
   }

   const mxArray* mxf = prhs[0];
   const mxArray* mxg = prhs[1];
   const mxArray* mxa = prhs[2];

   // input data length (rows of f) and number of columns of f
   const mwSize L = mxGetM(mxf);
   const mwSize W = mxGetN(mxf);

   // filter number
   const mwSize M = mxGetNumberOfElements(mxg);

   // a col count
   mwSize acols = mxGetN(mxa);

   // pointer to a
   // NOTE(review): the data of a is read as double without a class check.
   double *a = (double*) mxGetData(mxa);


   // If a has more than one column but every entry a[M + m] (second column,
   // assuming a has M rows - TODO confirm) equals 1, treat a as having a
   // single column.
   if (acols > 1)
   {
      int isOnes = 1;
      for (mwIndex m = 0; m < M; m++)
      {
         isOnes = isOnes && a[M + m] == 1;
      }

      if (isOnes)
      {
         acols = 1;
      }
   }

   // Cell output
   plhs[0] = mxCreateCellMatrix(M, 1);

   // Stuff for sorting the filters: one counter and one index list per group.
   // NOTE(review): the index lists are variable-length arrays sized by M, so
   // they live on the stack and have length zero when g is empty.
   mwSize tdCount = 0;
   mwSize fftCount = 0;
   mwSize fftblCount = 0;
   mwIndex tdArgsIdx[M];
   mwIndex fftArgsIdx[M];
   mwIndex fftblArgsIdx[M];

   // WALK the filters to determine what has to be done:
   //   field "h" present                               -> td group
   //   field "H" present, acols == 1 and numel(H) == L -> fft group
   //   field "H" present otherwise                     -> fftbl group
   // A filter with neither field is put in no group, so its output cell is
   // never set.
   for (mwIndex m = 0; m < M; m++)
   {
      mxArray * gEl = mxGetCell(mxg, m);
      if (mxGetField(gEl, 0, "h") != NULL)
      {
         tdArgsIdx[tdCount++] = m;
         continue;
      }

      if (mxGetField(gEl, 0, "H") != NULL)
      {
         if (acols == 1 && L == mxGetNumberOfElements(mxGetField(gEl, 0, "H")))
         {
            fftArgsIdx[fftCount++] = m;
            continue;
         }
         else
         {
            fftblArgsIdx[fftblCount++] = m;
            continue;
         }
      }
   }

   if (tdCount > 0)
   {
      /*
         Here, we have to reformat the inputs and pick up results to comply with:
         c=comp_filterbank_td(f,g,a,offset,ext);
         BEWARE OF THE AUTOMATIC DEALLOCATION by the Matlab engine!
         Arrays can very easily be freed twice, causing segfaults.
         This happens particularly when using mxCreateCell*, which stores
         pointers to other mxArray structs. Setting all such pointers to
         NULL after they are used seems to solve it.
      */
      mxArray* plhs_td[1];
      mxArray* prhs_td[5];
      prhs_td[0] = (mxArray*) mxf;
      prhs_td[1] = mxCreateCellMatrix(tdCount, 1);
      prhs_td[2] = mxCreateDoubleMatrix(tdCount, 1, mxREAL);
      double* aPtr = mxGetData(prhs_td[2]);
      prhs_td[3] = mxCreateDoubleMatrix(tdCount, 1, mxREAL);
      double* offsetPtr = mxGetData(prhs_td[3]);
      prhs_td[4] = mxCreateString("per");

      for (mwIndex m = 0; m < tdCount; m++)
      {
         mxArray * gEl = mxGetCell(mxg, tdArgsIdx[m]);
         // The "h" array is not copied: the temporary cell only borrows the
         // pointer that still belongs to g.
         mxSetCell(prhs_td[1], m, mxGetField(gEl, 0, "h"));
         // This has overhead
         //mxSetCell((mxArray*)prhs_td[1],m,mxDuplicateArray(mxGetField(gEl,0,"h")));

         // Only the first column of a is forwarded for this group.
         aPtr[m] = a[tdArgsIdx[m]];
         offsetPtr[m] = mxGetScalar(mxGetField(gEl, 0, "offset"));
      }

      // Finally call it!
      // comp_filterbank_td(1,plhs_td,5, prhs_td);
      // This has overhead:
      mexCallMATLAB(1, plhs_td, 5, prhs_td, "comp_filterbank_td");

      // Copy pointers to a proper index in the output + unset all duplicate cell elements
      // so that destroying the temporary cells below frees neither the results
      // now referenced by plhs[0] nor the borrowed "h" arrays.
      for (mwIndex m = 0; m < tdCount; m++)
      {
         mxSetCell(plhs[0], tdArgsIdx[m], mxGetCell(plhs_td[0], m));
         mxSetCell(plhs_td[0], m, NULL);
         mxSetCell(prhs_td[1], m, NULL);
      }
      mxDestroyArray(plhs_td[0]);
      mxDestroyArray(prhs_td[1]);
      mxDestroyArray(prhs_td[2]);
      mxDestroyArray(prhs_td[3]);
      mxDestroyArray(prhs_td[4]);

   }


   if (fftCount > 0 || fftblCount > 0)
   {
      // Need to do FFT of mxf
      mwIndex ndim = 2;
      const mwSize dims[] = {L, W};

      // (Re)create the cached buffer mxF and its FFTW plan when there is no
      // buffer yet or when its size or class differs from f.
      if (mxF == NULL || mxGetM(mxF) != L || mxGetN(mxF) != W || mxGetClassID(mxF) != mxGetClassID(mxf))
      {
         if (mxF != NULL)
         {
            mxDestroyArray(mxF);
            mxF = NULL;
            // printf("Should be called just once\n");
         }


         if (mxIsDouble(mxf))
         {
            mxF = mxCreateNumericArray(ndim, dims, mxDOUBLE_CLASS, mxCOMPLEX);
            fftw_iodim fftw_dims[1];
            fftw_iodim howmanydims[1];

            // One length-L transform with unit stride ...
            fftw_dims[0].n = L;
            fftw_dims[0].is = 1;
            fftw_dims[0].os = 1;

            // ... repeated W times, successive transforms L elements apart.
            howmanydims[0].n = W;
            howmanydims[0].is = L;
            howmanydims[0].os = L;

            // Allocate the plan holder on first use, otherwise drop the old plan.
            if (p_double == NULL)
               p_double = (fftw_plan*) malloc(sizeof(fftw_plan));
            else
               fftw_destroy_plan(*p_double);

            // FFTW_MEASURE sometimes hangs here
            // Input and output pointers are identical: the plan works in place
            // on the real and imaginary parts of mxF.
            *p_double = fftw_plan_guru_split_dft(
                           1, fftw_dims,
                           1, howmanydims,
                           mxGetData(mxF), mxGetImagData(mxF), mxGetData(mxF), mxGetImagData(mxF),
                           FFTW_ESTIMATE);

         }
         else if (mxIsSingle(mxf))
         {
            // Single-precision counterpart of the branch above (fftwf_* API).
            mxF = mxCreateNumericArray(ndim, dims, mxSINGLE_CLASS, mxCOMPLEX);
            // mexPrintf("M= %i, N= %i\n",mxGetM(mxF),mxGetN(mxF));
            fftwf_iodim fftw_dims[1];
            fftwf_iodim howmanydims[1];

            fftw_dims[0].n = L;
            fftw_dims[0].is = 1;
            fftw_dims[0].os = 1;

            howmanydims[0].n = W;
            howmanydims[0].is = L;
            howmanydims[0].os = L;

            if (p_float == NULL)
               p_float = (fftwf_plan*) malloc(sizeof(fftwf_plan));
            else
               fftwf_destroy_plan(*p_float);

            *p_float = fftwf_plan_guru_split_dft(
                          1, fftw_dims,
                          1, howmanydims,
                          mxGetData(mxF), mxGetImagData(mxF),
                          mxGetData(mxF), mxGetImagData(mxF),
                          FFTW_ESTIMATE);

         }
         // NOTE(review): when f is neither double nor single, no branch runs
         // and mxF stays NULL, yet it is passed to mexCallMATLAB further down.


      }

      // Copy f into the buffer (imaginary part zeroed first, then overwritten
      // when f is complex) and run the stored plan.
      if (mxIsDouble(mxf))
      {
         memcpy(mxGetPr(mxF), mxGetPr(mxf), L * W * sizeof(double));
         memset(mxGetPi(mxF), 0, L * W * sizeof(double));
         if (mxIsComplex(mxf))
            memcpy(mxGetPi(mxF), mxGetPi(mxf), L * W * sizeof(double));

         fftw_execute(*p_double);

      }
      else if (mxIsSingle(mxf))
      {
         memcpy(mxGetPr(mxF), mxGetPr(mxf), L * W * sizeof(float));
         memset(mxGetPi(mxF), 0, L * W * sizeof(float));
         if (mxIsComplex(mxf))
            memcpy(mxGetPi(mxF), mxGetPi(mxf), L * W * sizeof(float));

         fftwf_execute(*p_float);
      }
   }

   if (fftCount > 0)
   {
      // Arguments for comp_filterbank_fft: the transformed input mxF, a cell
      // of the "H" arrays and the first-column entries of a.
      mxArray* plhs_fft[1];
      mxArray* prhs_fft[3];
      prhs_fft[0] = mxF;
      prhs_fft[1] = mxCreateCellMatrix(fftCount, 1);
      prhs_fft[2] = mxCreateDoubleMatrix(fftCount, 1, mxREAL);
      double* aPtr = mxGetData(prhs_fft[2]);

      for (mwIndex m = 0; m < fftCount; m++)
      {
         mxArray * gEl = mxGetCell(mxg, fftArgsIdx[m]);
         // Borrowed pointer into g, as in the td branch.
         mxSetCell(prhs_fft[1], m, mxGetField(gEl, 0, "H"));
         // This has overhead
         //mxSetCell((mxArray*)prhs_td[1],m,mxDuplicateArray(mxGetField(gEl,0,"h")));
         aPtr[m] = a[fftArgsIdx[m]];
      }

      //comp_filterbank_fft(1,plhs_fft,3, prhs_fft);
      mexCallMATLAB(1, plhs_fft, 3, prhs_fft, "comp_filterbank_fft");

      // Move the results into the output and detach every shared pointer
      // before the temporary cells are destroyed.
      for (mwIndex m = 0; m < fftCount; m++)
      {
         mxSetCell(plhs[0], fftArgsIdx[m], mxGetCell(plhs_fft[0], m));
         mxSetCell(plhs_fft[0], m, NULL);
         mxSetCell(prhs_fft[1], m, NULL);
      }
      mxDestroyArray(plhs_fft[0]);
      mxDestroyArray(prhs_fft[1]);
      mxDestroyArray(prhs_fft[2]);
   }

   if (fftblCount > 0)
   {
      // Arguments for comp_filterbank_fftbl: mxF, a cell of the "H" arrays,
      // the "foff" values, a two-column copy of a and the realonly flags.
      mxArray* plhs_fftbl[1];
      mxArray* prhs_fftbl[5];
      prhs_fftbl[0] = mxF;
      prhs_fftbl[1] = mxCreateCellMatrix(fftblCount, 1);
      prhs_fftbl[2] = mxCreateDoubleMatrix(fftblCount, 1, mxREAL);
      prhs_fftbl[3] = mxCreateDoubleMatrix(fftblCount, 2, mxREAL);
      prhs_fftbl[4] = mxCreateDoubleMatrix(fftblCount, 1, mxREAL);
      double* foffPtr = mxGetData(prhs_fftbl[2]);
      double* aPtr = mxGetData(prhs_fftbl[3]);
      double* realonlyPtr = mxGetData(prhs_fftbl[4]);
      // Set all realonly flags to zero
      memset(realonlyPtr, 0, fftblCount * sizeof * realonlyPtr);

      for (mwIndex m = 0; m < fftblCount; m++)
      {
         mxArray * gEl = mxGetCell(mxg, fftblArgsIdx[m]);
         // Borrowed pointer into g, as in the branches above.
         mxSetCell(prhs_fftbl[1], m, mxGetField(gEl, 0, "H"));
         foffPtr[m] = mxGetScalar(mxGetField(gEl, 0, "foff"));
         aPtr[m] = a[fftblArgsIdx[m]];

         // The second column is taken from a when a has one, otherwise it is 1.
         if (acols > 1)
            aPtr[m + fftblCount] = a[fftblArgsIdx[m] + M];
         else
            aPtr[m + fftblCount] = 1;

         // Only if realonly is specified
         mxArray* mxrealonly;
         if ((mxrealonly = mxGetField(gEl, 0, "realonly")))
            realonlyPtr[m] = mxGetScalar(mxrealonly);
      }

      // comp_filterbank_fftbl(1,plhs_fftbl,5, prhs_fftbl);
      mexCallMATLAB(1, plhs_fftbl, 5, prhs_fftbl, "comp_filterbank_fftbl");

      // Move the results into the output and detach every shared pointer
      // before the temporary cells are destroyed.
      for (mwIndex m = 0; m < fftblCount; m++)
      {
         mxSetCell(plhs[0], fftblArgsIdx[m], mxGetCell(plhs_fftbl[0], m));
         mxSetCell(plhs_fftbl[0], m, NULL);
         mxSetCell(prhs_fftbl[1], m, NULL);
      }
      mxDestroyArray(plhs_fftbl[0]);
      mxDestroyArray(prhs_fftbl[1]);
      mxDestroyArray(prhs_fftbl[2]);
      mxDestroyArray(prhs_fftbl[3]);
      mxDestroyArray(prhs_fftbl[4]);
   }


   // Keep the FFT buffer for the next invocation ...
   if (mxF != NULL)
      mexMakeArrayPersistent(mxF);

   // ... unless it holds more than MAXARRAYLEN elements, in which case it is
   // released right away and rebuilt by the next call that needs it.
   if (L * W > MAXARRAYLEN && mxF != NULL)
   {
      //printf("Damn. Should not get here\n");
      mxDestroyArray(mxF);
      mxF = NULL;
   }

}
예제 #3
0
/* Conv2Sphere_semi_memo

   Convolves two functions defined on the 2-sphere, using the seminaive
   algorithms for the spherical harmonic transforms.

   With size = 2*bw:

   rdata, idata     - (size * size) arrays: real and imaginary parts of the
                      sampled function.
   rfilter, ifilter - (size * size) arrays: real and imaginary parts of the
                      sampled filter function.
   rres, ires       - (size * size) arrays receiving the real and imaginary
                      parts of the result.
   bw               - bandwidth.
   workspace        - caller-supplied scratch memory, see below.

   ASSUMPTIONS:
   1. data is strictly REAL
   2. the semi-naive algorithm is used for ALL orders (cutoff = bw);
      change the cutoff value if you want something different

   Workspace size, in doubles. Let

      legendreSize = Reduced_Naive_TableSize(bw,cutoff) +
                     Reduced_SpharmonicTableSize(bw,cutoff)

   then the function needs

      2 * legendreSize            (Legendre table + transposed table)
      4 * (bw*bw) + 2*bw          (coefficient arrays)
      8 * (bw*bw) + 10*bw         (scratchpad for the transforms)

   for a total of

      2 * legendreSize + 12 * (bw*bw) + 12*bw

   Suggestion - if you want to do multiple convolutions, do not allocate and
   free space, or recompute the spharmonic_pml tables, on every call.
   Allocate the workspace once before calling, and only set up pointers as
   the first step of the procedure.  The same goes for the FST, FZT and
   InvFST functions.
*/
void Conv2Sphere_semi_memo(double *rdata, double *idata, 
			   double *rfilter, double *ifilter, 
			   double *rres, double *ires, 
			   int bw,
			   double *workspace)
{
  /* sizes */
  const int n = 2 * bw ;           /* samples per direction */
  const int bwSq = bw * bw ;
  const int cutoff = bw ;          /* semi-naive for all orders */
  const int legendreSize = Reduced_Naive_TableSize(bw, cutoff) +
    Reduced_SpharmonicTableSize(bw, cutoff) ;

  /* carve the caller's workspace into consecutive regions */
  double *const pmlStorage = workspace ;                        /* legendreSize */
  double *const pmlTransStorage = pmlStorage + legendreSize ;   /* legendreSize */
  double *const dataCoefRe = pmlTransStorage + legendreSize ;   /* bw*bw */
  double *const dataCoefIm = dataCoefRe + bwSq ;                /* bw*bw */
  double *const prodCoefRe = dataCoefIm + bwSq ;                /* bw*bw */
  double *const prodCoefIm = prodCoefRe + bwSq ;                /* bw*bw */
  double *const filtCoefRe = prodCoefIm + bwSq ;                /* bw */
  double *const filtCoefIm = filtCoefRe + bw ;                  /* bw */
  double *const scratch = filtCoefIm + bw ;       /* (8*bw^2)+(10*bw) */

  double **pmlTable, **pmlTransTable ;
  double *weights ;

  /* fftw */
  fftw_plan dctPlan, idctPlan, fftPlan, ifftPlan ;
  fftw_iodim fwdDims[1], fwdBatch[1] ;
  fftw_iodim invDims[1], invBatch[1] ;

  /* weights for this bandwidth */
  weights = (double *) malloc(sizeof(double) * 4 * bw) ;
  makeweights( bw, weights ) ;

  /* DCT plans; they are executed through the GURU interface inside the
     transform routines */
  dctPlan = fftw_plan_r2r_1d( n, weights, rdata,
			      FFTW_REDFT10, FFTW_ESTIMATE ) ;   /* forward */
  idctPlan = fftw_plan_r2r_1d( n, weights, rdata,
			       FFTW_REDFT01, FFTW_ESTIMATE ) ;  /* inverse */

  /* forward fft: contiguous input, output written into a transposed array */
  fwdDims[0].n = n ;
  fwdDims[0].is = 1 ;
  fwdDims[0].os = n ;
  fwdBatch[0].n = n ;
  fwdBatch[0].is = n ;
  fwdBatch[0].os = 1 ;
  fftPlan = fftw_plan_guru_split_dft( 1, fwdDims,
				      1, fwdBatch,
				      rdata, idata,
				      workspace, workspace + (n * n),
				      FFTW_ESTIMATE ) ;

  /* inverse fft: works on a transposed array, i.e. the inputs of one
     length-2*bw transform sit 2*bw apart and its outputs are consecutive */
  invDims[0].n = n ;
  invDims[0].is = n ;
  invDims[0].os = 1 ;
  invBatch[0].n = n ;
  invBatch[0].is = 1 ;
  invBatch[0].os = n ;
  ifftPlan = fftw_plan_guru_split_dft( 1, invDims,
				       1, invBatch,
				       rdata, idata,
				       workspace, workspace + (n * n),
				       FFTW_ESTIMATE ) ;

  /* precompute the associated Legendre functions and their transpose */
  pmlTable = Spharmonic_Pml_Table(bw, pmlStorage, scratch) ;
  pmlTransTable = Transpose_Spharmonic_Pml_Table(pmlTable, bw,
						 pmlTransStorage, scratch) ;

  /* spherical transform of the data */
  FST_semi_memo(rdata, idata,
		dataCoefRe, dataCoefIm,
		bw,
		pmlTable,
		scratch,
		1,
		cutoff,
		&dctPlan,
		&fftPlan,
		weights ) ;

  /* transform of the filter */
  FZT_semi_memo(rfilter, ifilter,
		filtCoefRe, filtCoefIm,
		bw,
		pmlTable[0],
		scratch,
		1,
		&dctPlan,
		weights ) ;

  /* combine the two coefficient sets */
  TransMult(dataCoefRe, dataCoefIm, filtCoefRe, filtCoefIm,
	    prodCoefRe, prodCoefIm, bw) ;

  /* back to samples on the sphere */
  InvFST_semi_memo(prodCoefRe, prodCoefIm,
		   rres, ires,
		   bw,
		   pmlTransTable,
		   scratch,
		   1,
		   cutoff,
		   &idctPlan,
		   &ifftPlan ) ;

  free( weights ) ;

  /* the pointer tables were allocated inside Spharmonic_Pml_Table() and
     Transpose_Spharmonic_Pml_Table() and have to be released here */
  free( pmlTable ) ;
  free( pmlTransTable ) ;

  /* destroy plans */
  fftw_destroy_plan( ifftPlan ) ;
  fftw_destroy_plan( fftPlan ) ;
  fftw_destroy_plan( idctPlan ) ;
  fftw_destroy_plan( dctPlan ) ;
}
예제 #4
0
/*
  test_s2_semi_fly: round-trip accuracy and timing test.

  Usage: test_s2_semi_fly bw loops [error_file]

    bw         - bandwidth (positive integer)
    loops      - number of round trips to run (positive integer)
    error_file - optional; when given, the per-coefficient absolute errors of
                 the last iteration are written to it

  Each iteration draws random spherical harmonic coefficients of a
  real-valued function, runs InvFST_semi_fly followed by FST_semi_fly, and
  compares the recovered coefficients with the originals.  Per-iteration
  timings and errors are printed, followed by averages and standard
  deviations over all iterations.

  Returns 0 on success and 1 when the error file cannot be opened.  Invalid
  arguments, failed allocations and failed plan creation terminate the
  program through exit(1).
*/
int main(int argc, char **argv)
{
  FILE *errorsfp;
  int i, j, bw, size, loops;
  int l, m, dummy, cutoff ;
  int rank, howmany_rank ;
  int status ;
  double *rcoeffs, *icoeffs, *rdata, *idata, *rresult, *iresult;
  double *workspace, *weights;
  double dumx, dumy ;
  double *relerror, *curmax, granderror, grandrelerror;
  double realtmp, imagtmp,origmag, tmpmag;
  double tiny ;
  double ave_error, ave_relerror, stddev_error, stddev_relerror;
  double total_time, for_time, inv_time;
  double tstart, tstop;
  time_t seed;
  fftw_plan dctPlan, idctPlan ;
  fftw_plan fftPlan, ifftPlan ;
  fftw_iodim dims[1], howmany_dims[1];

  if (argc < 3)
    {
      fprintf(stdout,"Usage: test_s2_semi_fly bw loops [error_file]\n");
      exit(0);
    }

  bw = atoi(argv[1]);
  loops = atoi(argv[2]);

  /* atoi() yields 0 for non-numeric text; zero or negative values would give
     bogus allocation sizes, NULL fftw plans and a division by zero in the
     averages below */
  if ( (bw <= 0) || (loops <= 0) )
    {
      fprintf(stderr,"Error: bw and loops must be positive integers\n");
      exit( 1 ) ;
    }

  /*** ASSUMING WILL SEMINAIVE ALL ORDERS ***/
  cutoff = bw ;

  status = 0 ;
  size = 2*bw;
  total_time = 0.0;
  for_time = 0.0;
  inv_time = 0.0;
  granderror = 0.0;
  grandrelerror = 0.0; 

  /*
    allocate memory
  */

  rcoeffs = (double *) malloc(sizeof(double) * (bw * bw));
  icoeffs = (double *) malloc(sizeof(double) * (bw * bw));
  rdata = (double *) malloc(sizeof(double) * (size * size));
  idata = (double *) malloc(sizeof(double) * (size * size));
  rresult = (double *) malloc(sizeof(double) * (bw * bw));
  iresult = (double *) malloc(sizeof(double) * (bw * bw));
  workspace = (double *) malloc(sizeof(double) * 
				((10 * (bw*bw)) + 
				 (24 * bw)));

  /** space for errors **/
  relerror = (double *) malloc(sizeof(double) * loops);
  curmax = (double *) malloc(sizeof(double) * loops);

  /* make array for weights */
  weights = (double *) malloc(sizeof(double) * 4 * bw);


  /****
    At this point, check to see if all the memory has been
    allocated. If it has not, there's no point in going further.
    (The error arrays are part of the check as well.)
    ****/
  if ( (rdata == NULL) || (idata == NULL) ||
       (rresult == NULL) || (iresult == NULL) ||
       (rcoeffs == NULL) || (icoeffs == NULL) ||
       (workspace == NULL) || (weights == NULL) ||
       (relerror == NULL) || (curmax == NULL) )
    {
      perror("Error in allocating memory");
      exit( 1 ) ;
    }

  /*** generate a seed, needed to generate random data ***/
  time(&seed);
  srand48( seed );

  /*
    construct fftw plans
  */

  /* make DCT plans -> note that I will be using the GURU
     interface to execute these plans within the routines*/

  /* forward DCT */
  dctPlan = fftw_plan_r2r_1d( 2*bw, weights, rdata,
			      FFTW_REDFT10, FFTW_ESTIMATE ) ;
      
  /* inverse DCT */
  idctPlan = fftw_plan_r2r_1d( 2*bw, weights, rdata,
			       FFTW_REDFT01, FFTW_ESTIMATE );

  /*
    fftw "preamble" ;
    note that this plan places the output in a transposed array
  */
  rank = 1 ;
  dims[0].n = 2*bw ;
  dims[0].is = 1 ;
  dims[0].os = 2*bw ;
  howmany_rank = 1 ;
  howmany_dims[0].n = 2*bw ;
  howmany_dims[0].is = 2*bw ;
  howmany_dims[0].os = 1 ;
  
  /* forward fft */
  fftPlan = fftw_plan_guru_split_dft( rank, dims,
				      howmany_rank, howmany_dims,
				      rdata, idata,
				      workspace, workspace+(4*bw*bw),
				      FFTW_ESTIMATE );
  
  /*
    now plan for inverse fft - note that this plan assumes
    that I'm working with a transposed array, e.g. the inputs
    for a length 2*bw transform are placed every 2*bw apart,
    the output will be consecutive entries in the array
  */
  rank = 1 ;
  dims[0].n = 2*bw ;
  dims[0].is = 2*bw ;
  dims[0].os = 1 ;
  howmany_rank = 1 ;
  howmany_dims[0].n = 2*bw ;
  howmany_dims[0].is = 1 ;
  howmany_dims[0].os = 2*bw ;

  /* inverse fft */
  ifftPlan = fftw_plan_guru_split_dft( rank, dims,
				       howmany_rank, howmany_dims,
				       rdata, idata,
				       workspace, workspace+(4*bw*bw),
				       FFTW_ESTIMATE );

  /* the fftw planner functions return NULL when no plan could be created;
     the transform routines must never be handed such a plan */
  if ( (dctPlan == NULL) || (idctPlan == NULL) ||
       (fftPlan == NULL) || (ifftPlan == NULL) )
    {
      fprintf(stderr,"Error in creating fftw plans\n");
      exit( 1 ) ;
    }

  /* now make the weights */
  makeweights( bw, weights );

  /* guard against division by zero in the relative error;
     loop invariant, so computed once */
  tiny = pow(10.0, -50.0);

  /*
    now start the looping
  */
  fprintf(stdout,"about to enter loop\n\n");
  for(i=0; i<loops; i++){

    /****
	 loop to generate spherical harmonic coefficients
	 of a real-valued function
    *****/
    for(m=0;m<bw;m++)
      for(l=m;l<bw;l++){
	dumx = 2.0 * (drand48()-0.5);
	dumy = 2.0 * (drand48()-0.5);
	dummy = seanindex(m,l,bw);
	rcoeffs[dummy] = dumx;
	icoeffs[dummy] = dumy;
	dummy = seanindex(-m,l,bw);
	rcoeffs[dummy] = ((double) pow(-1.0, (double) m)) * dumx;
	icoeffs[dummy] = ((double) pow(-1.0, (double) (m + 1))) * dumy;
      }

    /* have to zero out the m=0 coefficients, since those are real */
    for(m=0;m<bw;m++)
      icoeffs[m] = 0.0;

    /* do the inverse spherical transform */
    tstart = csecond();    

    InvFST_semi_fly(rcoeffs,icoeffs,
		    rdata, idata,
		    bw,
		    workspace,
		    1,
		    cutoff,
		    &idctPlan,
		    &ifftPlan );

    tstop = csecond();
    inv_time += (tstop - tstart);
    
    fprintf(stdout,"inv time \t = %.4e\n", tstop - tstart);
    
    /* now do the forward spherical transform */
    tstart = csecond();

    FST_semi_fly(rdata, idata,
		 rresult, iresult,
		 bw,
		 workspace,
		 1,
		 cutoff,
		 &dctPlan,
		 &fftPlan,
		 weights ) ;

    tstop = csecond();    
    for_time += (tstop - tstart);
    
    fprintf(stdout,"forward time \t = %.4e\n", tstop - tstart);

    /* now to compute the error: largest absolute and largest relative
       deviation over all coefficients of this iteration */
    relerror[i] = 0.0;
    curmax[i] = 0.0;
    for(j=0;j<(bw*bw);j++){
      realtmp = rresult[j]-rcoeffs[j];
      imagtmp = iresult[j]-icoeffs[j];
      origmag = sqrt((rcoeffs[j]*rcoeffs[j]) + (icoeffs[j]*icoeffs[j]));
      tmpmag  = sqrt((realtmp*realtmp) + (imagtmp*imagtmp));
      relerror[i] = max(relerror[i],tmpmag/(origmag + tiny));
      curmax[i]  = max(curmax[i],tmpmag);
    }

    fprintf(stdout,"r-o error\t = %.12f\n", curmax[i]);
    fprintf(stdout,"(r-o)/o error\t = %.12f\n\n", relerror[i]);

    granderror += curmax[i];
    grandrelerror += relerror[i];
    
  }

  total_time = inv_time + for_time;

  ave_error = granderror / ( (double) loops );
  ave_relerror = grandrelerror / ( (double) loops );
  stddev_error = 0.0 ; stddev_relerror = 0.0;
  for( i = 0 ; i < loops ; i ++ )
    {
      stddev_error += pow( ave_error - curmax[i] , 2.0 );
      stddev_relerror += pow( ave_relerror - relerror[i] , 2.0 );
    }
  /*** the sample standard deviation is undefined for loops == 1;
       the values are then left at 0 ***/
  if( loops != 1 )
    {
      stddev_error = sqrt(stddev_error / ( (double) (loops - 1) ) );
      stddev_relerror = sqrt(stddev_relerror / ( (double) (loops - 1) ) );
    }

  fprintf(stdout,"Program: test_s2_semi_fly\n");
  fprintf(stdout,"Bandwidth = %d\n", bw);

#ifndef WALLCLOCK
  fprintf(stdout,"Total elapsed cpu time :\t\t %.4e seconds.\n",
	  total_time);
  fprintf(stdout,"Average cpu forward per iteration:\t %.4e seconds.\n",
	  for_time/((double) loops));  
  fprintf(stdout,"Average cpu inverse per iteration:\t %.4e seconds.\n",
	  inv_time/((double) loops));
#else
  fprintf(stdout,"Total elapsed wall time :\t\t %.4e seconds.\n",
	  total_time);
  fprintf(stdout,"Average wall forward per iteration:\t %.4e seconds.\n",
	  for_time/((double) loops));  
  fprintf(stdout,"Average wall inverse per iteration:\t %.4e seconds.\n",
	  inv_time/((double) loops));
#endif

  fprintf(stdout,"Average r-o error:\t\t %.4e\t",
	  granderror/((double) loops));
  fprintf(stdout,"std dev: %.4e\n",stddev_error);
  fprintf(stdout,"Average (r-o)/o error:\t\t %.4e\t",
	  grandrelerror/((double) loops));
  fprintf(stdout,"std dev: %.4e\n\n",stddev_relerror);

  /* optional dump of the per-coefficient errors of the last iteration */
  if (argc == 4)
    {
      errorsfp = fopen(argv[3],"w");
      if (errorsfp == NULL)
	{
	  /* report the failure, but still release everything below */
	  perror("Error in opening error file");
	  status = 1 ;
	}
      else
	{
	  for(m = 0 ; m < bw ; m++ )
	    {
	      for(l = m ; l< bw ; l++ )
		{
		  dummy = seanindex(m,l,bw);
		  fprintf(errorsfp,
			  "dummy = %d\t m = %d\tl = %d\t%.10f  %.10f\n",
			  dummy, m, l,
			  fabs(rcoeffs[dummy] - rresult[dummy]),
			  fabs(icoeffs[dummy] - iresult[dummy]));
		  
		  dummy = seanindex(-m,l,bw);	      
		  fprintf(errorsfp,
			  "dummy = %d\t m = %d\tl = %d\t%.10f  %.10f\n",
			  dummy, -m, l,
			  fabs(rcoeffs[dummy] - rresult[dummy]),
			  fabs(icoeffs[dummy] - iresult[dummy]));
		}
	    }
	  fclose(errorsfp);
	}
    }

  /* destroy fftw plans */
  fftw_destroy_plan( ifftPlan );
  fftw_destroy_plan( fftPlan );
  fftw_destroy_plan( idctPlan );
  fftw_destroy_plan( dctPlan );

  /* free memory */
  free( weights );
  free(curmax);
  free(relerror);
  free(workspace);
  free(iresult);
  free(rresult);
  free(idata);
  free(rdata);
  free(icoeffs);
  free(rcoeffs);

  return status ;

}