Пример #1
0
void computeReferenceF(clFFT_SplitComplex *out, clFFT_Dim3 n, 
					  unsigned int batchSize, clFFT_Dimension dim, clFFT_Direction dir)
{
	FFTSetup plan_vdsp;
	DSPSplitComplex out_vdsp;
	FFTDirection dir_vdsp = dir == clFFT_Forward ? FFT_FORWARD : FFT_INVERSE;
	
	unsigned int i, j, k;
	unsigned int stride;
	unsigned int log2Nx = (unsigned int) log2(n.x);
	unsigned int log2Ny = (unsigned int) log2(n.y);
	unsigned int log2Nz = (unsigned int) log2(n.z);
	unsigned int log2N;
	
	log2N = log2Nx;
	log2N = log2N > log2Ny ? log2N : log2Ny;
	log2N = log2N > log2Nz ? log2N : log2Nz;
	
	plan_vdsp = vDSP_create_fftsetup(log2N, 2);
	
	switch(dim)
	{
		case clFFT_1D:
			
			for(i = 0; i < batchSize; i++)
			{
				stride = i * n.x;
				out_vdsp.realp  = out->real  + stride;
				out_vdsp.imagp  = out->imag  + stride;
				
			    vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp);
			}
			break;
			
		case clFFT_2D:
			
			for(i = 0; i < batchSize; i++)
			{
				for(j = 0; j < n.y; j++)
				{
					stride = j * n.x + i * n.x * n.y;
					out_vdsp.realp = out->real + stride;
					out_vdsp.imagp = out->imag + stride;
					
					vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp);
				}
			}
			for(i = 0; i < batchSize; i++)
			{
				for(j = 0; j < n.x; j++)
				{
					stride = j + i * n.x  * n.y;
					out_vdsp.realp = out->real + stride;
					out_vdsp.imagp = out->imag + stride;
					
					vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp);
				}
			}
			break;
			
		case clFFT_3D:
			
			for(i = 0; i < batchSize; i++)
			{
				for(j = 0; j < n.z; j++)
				{
					for(k = 0; k < n.y; k++)
					{
						stride = k * n.x + j * n.x * n.y + i * n.x * n.y * n.z;
						out_vdsp.realp = out->real + stride;
						out_vdsp.imagp = out->imag + stride;
						
						vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp);
					}
				}
			}
			for(i = 0; i < batchSize; i++)
			{
				for(j = 0; j < n.z; j++)
				{
					for(k = 0; k < n.x; k++)
					{
						stride = k + j * n.x * n.y + i * n.x * n.y * n.z;
						out_vdsp.realp = out->real + stride;
						out_vdsp.imagp = out->imag + stride;
						
						vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp);
					}
				}
			}
			for(i = 0; i < batchSize; i++)
			{
				for(j = 0; j < n.y; j++)
				{
					for(k = 0; k < n.x; k++)
					{
						stride = k + j * n.x + i * n.x * n.y * n.z;
						out_vdsp.realp = out->real + stride;
						out_vdsp.imagp = out->imag + stride;
						
						vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x*n.y, log2Nz, dir_vdsp);
					}
				}
			}
			break;
	}
	
	vDSP_destroy_fftsetup(plan_vdsp);
}
/*	Demonstrate the complex one-dimensional in-place FFT, vDSP_fft_zip.

	The in-place FFT writes results into the same array that contains the
	input data.  This may be faster than an out-of-place routine because it
	uses less memory (so there is less data to load from memory and a
	greater chance of keeping data in cache).
*/
static void DemonstratevDSP_fft_zip(FFTSetup Setup)
{
	/*	Define a stride for the array be passed to the FFT.  In many
		applications, the stride is one and is passed to the vDSP
		routine as a constant.
	*/
	const vDSP_Stride SignalStride = 1;

	// Define a variable for a loop iterator.
	vDSP_Length i;

	// Define some variables used to time the routine.
	ClockData t0, t1;
	double Time;

	printf("\n\tOne-dimensional complex FFT of %lu elements.\n",
		(unsigned long) N);

	// Allocate memory for the arrays.
	DSPSplitComplex Signal;
	Signal.realp = malloc(N * SignalStride * sizeof Signal.realp);
	Signal.imagp = malloc(N * SignalStride * sizeof Signal.imagp);

	if (Signal.realp == NULL || Signal.imagp == NULL)
	{
		fprintf(stderr, "Error, failed to allocate memory.\n");
		exit(EXIT_FAILURE);
	}

	/*	Generate an input signal.  In a real application, data would of
		course be provided from an image file, sensors, or other source.
	*/
	const float Frequency0 = 400, Frequency1 = 623, Frequency2 = 931;
	const float Phase0 = .618, Phase1 = .7f, Phase2 = .125;
	for (i = 0; i < N; ++i)
	{
		Signal.realp[i*SignalStride] =
			  cos((i * Frequency0 / N + Phase0) * TwoPi)
			+ cos((i * Frequency1 / N + Phase1) * TwoPi)
			+ cos((i * Frequency2 / N + Phase2) * TwoPi);
		Signal.imagp[i*SignalStride] =
			  sin((i * Frequency0 / N + Phase0) * TwoPi)
			+ sin((i * Frequency1 / N + Phase1) * TwoPi)
			+ sin((i * Frequency2 / N + Phase2) * TwoPi);
	}

	// Perform an FFT.
	vDSP_fft_zip(Setup, &Signal, SignalStride, Log2N, FFT_FORWARD);

	/*	Prepare expected results based on analytical transformation of
		the input signal.
	*/
	DSPSplitComplex Expected;
	Expected.realp = malloc(N * sizeof Expected.realp);
	Expected.imagp = malloc(N * sizeof Expected.imagp);

	if (Expected.realp == NULL || Expected.imagp == NULL)
	{
		fprintf(stderr, "Error, failed to allocate memory.\n");
		exit(EXIT_FAILURE);
	}

	for (i = 0; i < N; ++i)
		Expected.realp[i] = Expected.imagp[i] = 0;

	// Add the frequencies in the signal to the expected results.
	Expected.realp[(int) Frequency0] = N * cos(Phase0 * TwoPi);
	Expected.imagp[(int) Frequency0] = N * sin(Phase0 * TwoPi);

	Expected.realp[(int) Frequency1] = N * cos(Phase1 * TwoPi);
	Expected.imagp[(int) Frequency1] = N * sin(Phase1 * TwoPi);

	Expected.realp[(int) Frequency2] = N * cos(Phase2 * TwoPi);
	Expected.imagp[(int) Frequency2] = N * sin(Phase2 * TwoPi);

	// Compare the observed results to the expected results.
	CompareComplexVectors(Expected, Signal, N);

	// Release memory.
	free(Expected.realp);
	free(Expected.imagp);

	/*	The above shows how to use the vDSP_fft_zip routine.  Now we
		will see how fast it is.
	*/

	/*	Zero the signal before timing because repeated FFTs on non-zero
		data can cause abnormalities such as infinities, NaNs, and
		subnormal numbers.
	*/
	for (i = 0; i < N; ++i)
		Signal.realp[i] = Signal.imagp[i] = 0;

	// Time vDSP_fft_zip by itself.

	t0 = Clock();

	for (i = 0; i < Iterations; ++i)
		vDSP_fft_zip(Setup, &Signal, SignalStride, Log2N, FFT_FORWARD);

	t1 = Clock();

	// Average the time over all the loop iterations.
	Time = ClockToSeconds(t1, t0) / Iterations;

	printf("\tvDSP_fft_zip on %lu elements takes %g microseconds.\n",
		(unsigned long) N, Time * 1e6);

	// Release resources.
	free(Signal.realp);
	free(Signal.imagp);
}
Пример #3
0
void benchmark_ffts(int N, int cplx) {
  int Nfloat = (cplx ? N*2 : N);
  int Nbytes = Nfloat * sizeof(float);
  float *X = pffft_aligned_malloc(Nbytes), *Y = pffft_aligned_malloc(Nbytes), *Z = pffft_aligned_malloc(Nbytes);

  double t0, t1, flops;

  int k;
  int max_iter = 5120000/N*4;
#ifdef __arm__
  max_iter /= 4;
#endif
  int iter;

  for (k = 0; k < Nfloat; ++k) {
    X[k] = 0; //sqrtf(k+1);
  }

  // FFTPack benchmark
  {
    float *wrk = malloc(2*Nbytes + 15*sizeof(float));
    int max_iter_ = max_iter/pffft_simd_size(); if (max_iter_ == 0) max_iter_ = 1;
    if (cplx) cffti(N, wrk);
    else      rffti(N, wrk);
    t0 = uclock_sec();  
    
    for (iter = 0; iter < max_iter_; ++iter) {
      if (cplx) {
        cfftf(N, X, wrk);
        cfftb(N, X, wrk);
      } else {
        rfftf(N, X, wrk);
        rfftb(N, X, wrk);
      }
    }
    t1 = uclock_sec();
    free(wrk);
    
    flops = (max_iter_*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
    show_output("FFTPack", N, cplx, flops, t0, t1, max_iter_);
  }

#ifdef HAVE_VECLIB
  int log2N = (int)(log(N)/log(2) + 0.5f);
  if (N == (1<<log2N)) {
    FFTSetup setup;

    setup = vDSP_create_fftsetup(log2N, FFT_RADIX2);
    DSPSplitComplex zsamples;
    zsamples.realp = &X[0];
    zsamples.imagp = &X[Nfloat/2];
    t0 = uclock_sec();  
    for (iter = 0; iter < max_iter; ++iter) {
      if (cplx) {
        vDSP_fft_zip(setup, &zsamples, 1, log2N, kFFTDirection_Forward);
        vDSP_fft_zip(setup, &zsamples, 1, log2N, kFFTDirection_Inverse);
      } else {
        vDSP_fft_zrip(setup, &zsamples, 1, log2N, kFFTDirection_Forward); 
        vDSP_fft_zrip(setup, &zsamples, 1, log2N, kFFTDirection_Inverse);
      }
    }
    t1 = uclock_sec();
    vDSP_destroy_fftsetup(setup);

    flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
    show_output("vDSP", N, cplx, flops, t0, t1, max_iter);
  } else {
    show_output("vDSP", N, cplx, -1, -1, -1, -1);
  }
#endif
  
#ifdef HAVE_FFTW
  {
    fftwf_plan planf, planb;
    fftw_complex *in = (fftw_complex*) fftwf_malloc(sizeof(fftw_complex) * N);
    fftw_complex *out = (fftw_complex*) fftwf_malloc(sizeof(fftw_complex) * N);
    memset(in, 0, sizeof(fftw_complex) * N);
    int flags = (N < 40000 ? FFTW_MEASURE : FFTW_ESTIMATE);  // measure takes a lot of time on largest ffts
    //int flags = FFTW_ESTIMATE;
    if (cplx) {
      planf = fftwf_plan_dft_1d(N, (fftwf_complex*)in, (fftwf_complex*)out, FFTW_FORWARD, flags);
      planb = fftwf_plan_dft_1d(N, (fftwf_complex*)in, (fftwf_complex*)out, FFTW_BACKWARD, flags);
    } else {
      planf = fftwf_plan_dft_r2c_1d(N, (float*)in, (fftwf_complex*)out, flags);
      planb = fftwf_plan_dft_c2r_1d(N, (fftwf_complex*)in, (float*)out, flags);
    }

    t0 = uclock_sec();  
    for (iter = 0; iter < max_iter; ++iter) {
      fftwf_execute(planf);
      fftwf_execute(planb);
    }
    t1 = uclock_sec();

    fftwf_destroy_plan(planf);
    fftwf_destroy_plan(planb);
    fftwf_free(in); fftwf_free(out);

    flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
    show_output((flags == FFTW_MEASURE ? "FFTW (meas.)" : " FFTW (estim)"), N, cplx, flops, t0, t1, max_iter);
  }
#endif  

  // PFFFT benchmark
  {
    PFFFT_Setup *s = pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
    if (s) {
      t0 = uclock_sec();  
      for (iter = 0; iter < max_iter; ++iter) {
        pffft_transform(s, X, Z, Y, PFFFT_FORWARD);
        pffft_transform(s, X, Z, Y, PFFFT_BACKWARD);
      }
      t1 = uclock_sec();
      pffft_destroy_setup(s);
    
      flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
      show_output("PFFFT", N, cplx, flops, t0, t1, max_iter);
    }
  }

  if (!array_output_format) {
    printf("--\n");
  }

  pffft_aligned_free(X);
  pffft_aligned_free(Y);
  pffft_aligned_free(Z);
}