void computeReferenceF(clFFT_SplitComplex *out, clFFT_Dim3 n, unsigned int batchSize, clFFT_Dimension dim, clFFT_Direction dir) { FFTSetup plan_vdsp; DSPSplitComplex out_vdsp; FFTDirection dir_vdsp = dir == clFFT_Forward ? FFT_FORWARD : FFT_INVERSE; unsigned int i, j, k; unsigned int stride; unsigned int log2Nx = (unsigned int) log2(n.x); unsigned int log2Ny = (unsigned int) log2(n.y); unsigned int log2Nz = (unsigned int) log2(n.z); unsigned int log2N; log2N = log2Nx; log2N = log2N > log2Ny ? log2N : log2Ny; log2N = log2N > log2Nz ? log2N : log2Nz; plan_vdsp = vDSP_create_fftsetup(log2N, 2); switch(dim) { case clFFT_1D: for(i = 0; i < batchSize; i++) { stride = i * n.x; out_vdsp.realp = out->real + stride; out_vdsp.imagp = out->imag + stride; vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp); } break; case clFFT_2D: for(i = 0; i < batchSize; i++) { for(j = 0; j < n.y; j++) { stride = j * n.x + i * n.x * n.y; out_vdsp.realp = out->real + stride; out_vdsp.imagp = out->imag + stride; vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp); } } for(i = 0; i < batchSize; i++) { for(j = 0; j < n.x; j++) { stride = j + i * n.x * n.y; out_vdsp.realp = out->real + stride; out_vdsp.imagp = out->imag + stride; vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp); } } break; case clFFT_3D: for(i = 0; i < batchSize; i++) { for(j = 0; j < n.z; j++) { for(k = 0; k < n.y; k++) { stride = k * n.x + j * n.x * n.y + i * n.x * n.y * n.z; out_vdsp.realp = out->real + stride; out_vdsp.imagp = out->imag + stride; vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp); } } } for(i = 0; i < batchSize; i++) { for(j = 0; j < n.z; j++) { for(k = 0; k < n.x; k++) { stride = k + j * n.x * n.y + i * n.x * n.y * n.z; out_vdsp.realp = out->real + stride; out_vdsp.imagp = out->imag + stride; vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp); } } } for(i = 0; i < batchSize; i++) { for(j = 0; j < n.y; j++) { for(k = 0; k < n.x; k++) { stride = k + j * n.x + i * n.x * n.y * n.z; 
out_vdsp.realp = out->real + stride; out_vdsp.imagp = out->imag + stride; vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x*n.y, log2Nz, dir_vdsp); } } } break; } vDSP_destroy_fftsetup(plan_vdsp); }
/*
 * Demonstrate the complex one-dimensional in-place FFT, vDSP_fft_zip.
 *
 * The in-place FFT writes results into the same array that contains the
 * input data.  This may be faster than an out-of-place routine because it
 * uses less memory (so there is less data to load from memory and a greater
 * chance of keeping data in cache).
 *
 * Setup is an FFT setup that must have been created for at least Log2N.
 * The routine builds a three-tone test signal of N elements, transforms it,
 * compares the result with the analytically expected spectrum, and then
 * times repeated transforms.  It exits the process if memory cannot be
 * allocated.
 */
static void DemonstratevDSP_fft_zip(FFTSetup Setup)
{
    /*
     * Define a stride for the array to be passed to the FFT.  In many
     * applications, the stride is one and is passed to the vDSP routine as
     * a constant.
     */
    const vDSP_Stride SignalStride = 1;

    // Define a variable for a loop iterator.
    vDSP_Length i;

    // Define some variables used to time the routine.
    ClockData t0, t1;
    double Time;

    printf("\n\tOne-dimensional complex FFT of %lu elements.\n",
        (unsigned long) N);

    /*
     * Allocate memory for the arrays.  The size is taken from the element
     * type (sizeof *pointer); sizing by the pointer itself would allocate
     * pointer-sized slots instead of float-sized ones.
     */
    DSPSplitComplex Signal;
    Signal.realp = malloc(N * SignalStride * sizeof *Signal.realp);
    Signal.imagp = malloc(N * SignalStride * sizeof *Signal.imagp);

    if (Signal.realp == NULL || Signal.imagp == NULL)
    {
        fprintf(stderr, "Error, failed to allocate memory.\n");
        exit(EXIT_FAILURE);
    }

    /*
     * Generate an input signal.  In a real application, data would of
     * course be provided from an image file, sensors, or other source.
     */
    const float Frequency0 = 400, Frequency1 = 623, Frequency2 = 931;
    const float Phase0 = .618, Phase1 = .7f, Phase2 = .125;
    for (i = 0; i < N; ++i)
    {
        Signal.realp[i*SignalStride] =
              cos((i * Frequency0 / N + Phase0) * TwoPi)
            + cos((i * Frequency1 / N + Phase1) * TwoPi)
            + cos((i * Frequency2 / N + Phase2) * TwoPi);
        Signal.imagp[i*SignalStride] =
              sin((i * Frequency0 / N + Phase0) * TwoPi)
            + sin((i * Frequency1 / N + Phase1) * TwoPi)
            + sin((i * Frequency2 / N + Phase2) * TwoPi);
    }

    // Perform an FFT.
    vDSP_fft_zip(Setup, &Signal, SignalStride, Log2N, FFT_FORWARD);

    /*
     * Prepare expected results based on analytical transformation of the
     * input signal.
     */
    DSPSplitComplex Expected;
    Expected.realp = malloc(N * sizeof *Expected.realp);
    Expected.imagp = malloc(N * sizeof *Expected.imagp);

    if (Expected.realp == NULL || Expected.imagp == NULL)
    {
        fprintf(stderr, "Error, failed to allocate memory.\n");
        exit(EXIT_FAILURE);
    }

    for (i = 0; i < N; ++i)
        Expected.realp[i] = Expected.imagp[i] = 0;

    /*
     * Add the frequencies in the signal to the expected results.  Each tone
     * lands in the bin whose index equals its frequency, so N must be
     * greater than the largest frequency above for these writes to stay
     * inside the arrays.
     */
    Expected.realp[(int) Frequency0] = N * cos(Phase0 * TwoPi);
    Expected.imagp[(int) Frequency0] = N * sin(Phase0 * TwoPi);

    Expected.realp[(int) Frequency1] = N * cos(Phase1 * TwoPi);
    Expected.imagp[(int) Frequency1] = N * sin(Phase1 * TwoPi);

    Expected.realp[(int) Frequency2] = N * cos(Phase2 * TwoPi);
    Expected.imagp[(int) Frequency2] = N * sin(Phase2 * TwoPi);

    // Compare the observed results to the expected results.
    CompareComplexVectors(Expected, Signal, N);

    // Release memory.
    free(Expected.realp);
    free(Expected.imagp);

    /*
     * The above shows how to use the vDSP_fft_zip routine.  Now we will
     * see how fast it is.
     */

    /*
     * Zero the signal before timing because repeated FFTs on non-zero data
     * can cause abnormalities such as infinities, NaNs, and subnormal
     * numbers.
     */
    for (i = 0; i < N; ++i)
        Signal.realp[i] = Signal.imagp[i] = 0;

    // Time vDSP_fft_zip by itself.
    t0 = Clock();

    for (i = 0; i < Iterations; ++i)
        vDSP_fft_zip(Setup, &Signal, SignalStride, Log2N, FFT_FORWARD);

    t1 = Clock();

    // Average the time over all the loop iterations.
    Time = ClockToSeconds(t1, t0) / Iterations;

    printf("\tvDSP_fft_zip on %lu elements takes %g microseconds.\n",
        (unsigned long) N, Time * 1e6);

    // Release resources.
    free(Signal.realp);
    free(Signal.imagp);
}
void benchmark_ffts(int N, int cplx) { int Nfloat = (cplx ? N*2 : N); int Nbytes = Nfloat * sizeof(float); float *X = pffft_aligned_malloc(Nbytes), *Y = pffft_aligned_malloc(Nbytes), *Z = pffft_aligned_malloc(Nbytes); double t0, t1, flops; int k; int max_iter = 5120000/N*4; #ifdef __arm__ max_iter /= 4; #endif int iter; for (k = 0; k < Nfloat; ++k) { X[k] = 0; //sqrtf(k+1); } // FFTPack benchmark { float *wrk = malloc(2*Nbytes + 15*sizeof(float)); int max_iter_ = max_iter/pffft_simd_size(); if (max_iter_ == 0) max_iter_ = 1; if (cplx) cffti(N, wrk); else rffti(N, wrk); t0 = uclock_sec(); for (iter = 0; iter < max_iter_; ++iter) { if (cplx) { cfftf(N, X, wrk); cfftb(N, X, wrk); } else { rfftf(N, X, wrk); rfftb(N, X, wrk); } } t1 = uclock_sec(); free(wrk); flops = (max_iter_*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html show_output("FFTPack", N, cplx, flops, t0, t1, max_iter_); } #ifdef HAVE_VECLIB int log2N = (int)(log(N)/log(2) + 0.5f); if (N == (1<<log2N)) { FFTSetup setup; setup = vDSP_create_fftsetup(log2N, FFT_RADIX2); DSPSplitComplex zsamples; zsamples.realp = &X[0]; zsamples.imagp = &X[Nfloat/2]; t0 = uclock_sec(); for (iter = 0; iter < max_iter; ++iter) { if (cplx) { vDSP_fft_zip(setup, &zsamples, 1, log2N, kFFTDirection_Forward); vDSP_fft_zip(setup, &zsamples, 1, log2N, kFFTDirection_Inverse); } else { vDSP_fft_zrip(setup, &zsamples, 1, log2N, kFFTDirection_Forward); vDSP_fft_zrip(setup, &zsamples, 1, log2N, kFFTDirection_Inverse); } } t1 = uclock_sec(); vDSP_destroy_fftsetup(setup); flops = (max_iter*2) * ((cplx ? 
5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html show_output("vDSP", N, cplx, flops, t0, t1, max_iter); } else { show_output("vDSP", N, cplx, -1, -1, -1, -1); } #endif #ifdef HAVE_FFTW { fftwf_plan planf, planb; fftw_complex *in = (fftw_complex*) fftwf_malloc(sizeof(fftw_complex) * N); fftw_complex *out = (fftw_complex*) fftwf_malloc(sizeof(fftw_complex) * N); memset(in, 0, sizeof(fftw_complex) * N); int flags = (N < 40000 ? FFTW_MEASURE : FFTW_ESTIMATE); // measure takes a lot of time on largest ffts //int flags = FFTW_ESTIMATE; if (cplx) { planf = fftwf_plan_dft_1d(N, (fftwf_complex*)in, (fftwf_complex*)out, FFTW_FORWARD, flags); planb = fftwf_plan_dft_1d(N, (fftwf_complex*)in, (fftwf_complex*)out, FFTW_BACKWARD, flags); } else { planf = fftwf_plan_dft_r2c_1d(N, (float*)in, (fftwf_complex*)out, flags); planb = fftwf_plan_dft_c2r_1d(N, (fftwf_complex*)in, (float*)out, flags); } t0 = uclock_sec(); for (iter = 0; iter < max_iter; ++iter) { fftwf_execute(planf); fftwf_execute(planb); } t1 = uclock_sec(); fftwf_destroy_plan(planf); fftwf_destroy_plan(planb); fftwf_free(in); fftwf_free(out); flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html show_output((flags == FFTW_MEASURE ? "FFTW (meas.)" : " FFTW (estim)"), N, cplx, flops, t0, t1, max_iter); } #endif // PFFFT benchmark { PFFFT_Setup *s = pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL); if (s) { t0 = uclock_sec(); for (iter = 0; iter < max_iter; ++iter) { pffft_transform(s, X, Z, Y, PFFFT_FORWARD); pffft_transform(s, X, Z, Y, PFFFT_BACKWARD); } t1 = uclock_sec(); pffft_destroy_setup(s); flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html show_output("PFFFT", N, cplx, flops, t0, t1, max_iter); } } if (!array_output_format) { printf("--\n"); } pffft_aligned_free(X); pffft_aligned_free(Y); pffft_aligned_free(Z); }