/*
 * Iterative, in-place radix-2 decimation-in-time Cooley-Tukey FFT.
 *
 * K        Number of points. Stages are run for every span N = 2, 4, 8, ...
 *          with N <= K. NOTE(review): presumably K must be a power of two;
 *          nothing here checks it -- confirm against callers.
 * indices  Table of K ints, filled here by calcFftIndices(K, indices) and then
 *          used to address x (every access is x[indices[...]]).
 * x        The K input samples. Overwritten by the butterflies.
 * f        Output: f[i] = x[indices[i]] for i in [0, K).
 *
 * The twiddle factor for butterfly k of a stage with span N is taken from
 * fftSinCos((float)k / N, &sin, &cos); the sign/scale convention of the
 * angle is whatever fftSinCos implements (not visible from here).
 *
 * Profiling hooks (PROFILE_MODE == 2 kernel timers and the
 * kernel_invocations counter) are intentionally absent: performance is not
 * measured in this routine.
 */
void radix2DitCooleyTukeyFft(int K, int* indices, Complex* x, Complex* f)
{
    int i;
    int j;
    int k;
    int N;
    int step;
    int eI;
    int oI;
    Complex even;
    Complex twiddled;
    float fftSin;
    float fftCos;
    float arg;

    calcFftIndices(K, indices);

    /*
     * Iterate on the half-span (step = N / 2) instead of recomputing
     * N = 1 << (i + 1): the stages visited are the same (N = 2, 4, ... <= K),
     * but no value larger than K is ever formed, so large K cannot trigger a
     * signed-shift overflow when the loop condition is re-evaluated.
     */
    for (step = 1; step <= K / 2; step *= 2) {
        N = step * 2;

        for (j = 0; j < K; j += N) {
            for (k = 0; k < step; k++) {
                arg = (float)k / N;
                eI = indices[j + k];
                oI = indices[j + step + k];

                fftSinCos(arg, &fftSin, &fftCos);

                even = x[eI];

                /*
                 * Form the twiddle product once, from the untouched odd
                 * sample. Writing x[oI].real first and then re-reading it to
                 * compute x[oI].imag would mix the new real part into the
                 * imaginary result.
                 */
                twiddled.real = x[oI].real * fftCos - x[oI].imag * fftSin;
                twiddled.imag = x[oI].imag * fftCos + x[oI].real * fftSin;

                x[eI].real = even.real + twiddled.real;
                x[eI].imag = even.imag + twiddled.imag;
                x[oI].real = even.real - twiddled.real;
                x[oI].imag = even.imag - twiddled.imag;
            }
        }
    }

    /* Gather the results through the index table. */
    for (i = 0; i < K; ++i) {
        f[i] = x[indices[i]];
    }
}
/* Plain vanilla, unoptimized, platform-independent twist */ static MFFTReturn fft1DTwistSmall( MatrixFFTPlan mfftPlan, FFTComplex *buf, bool forward, size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess) { RFASSERT((mfftPlan->sinTableType == STT_Standard) || (mfftPlan->sinTableType == STT_External)); FFTFloat imagSign = forward ? -1.0 : 1.0; #if DUMP_MATRIX #if FFT_SPLIT_COMPLEX FFTComplex start; fftComplexOffset(buf, startRow * numCols, &start); dumpMatrixRect("fft1DTwistSmall input", &start, rowsToProcess, numCols); #else FFTComplex *start = buf + (startRow * numCols); dumpMatrixRect("fft1DTwistSmall input", start, rowsToProcess, numCols); #endif /* FFT_SPLIT_COMPLEX */ #endif /* DUMP_MATRIX */ size_t row = startRow; for(size_t rowDex=0; rowDex<rowsToProcess; rowDex++, row++) { PolyComplex pc(buf, row * numCols); for(size_t col=0; col<numCols; col++) { FFTFloat cosv, sinv; fftSinCos(mfftPlan, row*col, &cosv, &sinv); sinv *= imagSign; FFTFloat r = (cosv * pc.real()) - (sinv * pc.imag()); FFTFloat i = (cosv * pc.imag()) + (sinv * pc.real()); pc.real(r); pc.imag(i); ++pc; } } #if FFT_SPLIT_COMPLEX dumpMatrixRect("fft1DTwistSmall output", &start, rowsToProcess, numCols); #else dumpMatrixRect("fft1DTwistSmall output", start, rowsToProcess, numCols); #endif return MR_Success; }
/* * Intel, precision-independent. */ static MFFTReturn fft1DTwistOpt( MatrixFFTPlan mfftPlan, FFTComplex *buf, bool forward, size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess) { FFTFloat imagSign = forward ? -1.0 : 1.0; FFTVector vImagSign = FFTVectSet1(imagSign); size_t lastRow = startRow + rowsToProcess; #if DUMP_MATRIX #if FFT_SPLIT_COMPLEX FFTComplex start; fftComplexOffset(buf, startRow * numCols, &start); dumpMatrixRect("fft1DTwistOpt input", &start, rowsToProcess, numCols); #else FFTComplex *start = buf + (startRow * numCols); dumpMatrixRect("fft1DTwistOpt input", start, rowsToProcess, numCols); #endif /* FFT_SPLIT_COMPLEX */ #endif /* DUMP_MATRIX */ for(size_t row=startRow; row<lastRow; row++) { size_t rowOff = numCols * row; PolyComplex pc(buf, rowOff); FFTVector vTempCos; FFTVector vCurCos; FFTVector vCurSin; FFTVector vIncA; FFTVector vIncB; FFTVectUnion transferCos; FFTVectUnion transferSin; unsigned angleIndex; // set up initial sin & cos vectors if(mfftPlan->sinPeriod) { for(angleIndex = 0; angleIndex < FFT_FLOATS_PER_VECTOR; angleIndex++) { fftSinCosOpt(mfftPlan, row, angleIndex, &transferCos.f[angleIndex], &transferSin.f[angleIndex]); } } else { for(angleIndex = 0; angleIndex < FFT_FLOATS_PER_VECTOR; angleIndex++) { fftSinCos(mfftPlan, row*angleIndex, &transferCos.f[angleIndex], &transferSin.f[angleIndex]); } } vCurCos = transferCos.v; vCurSin = transferSin.v; // angle of increment between steps, FFT_FLOATS_PER_VECTOR steps, since // each vector has FFT_FLOATS_PER_VECTOR elements FFTFloat incA, incB; if(mfftPlan->sinPeriod) { fftSinCosOpt(mfftPlan, row, FFT_FLOATS_PER_VECTOR / 2, NULL, &incA); incA = incA*incA*2; fftSinCosOpt(mfftPlan, row, FFT_FLOATS_PER_VECTOR, NULL, &incB); } else { size_t incAngle = row * FFT_FLOATS_PER_VECTOR; fftSinCos(mfftPlan, incAngle / 2, NULL, &incA); incA = incA*incA*2; fftSinCos(mfftPlan, incAngle, NULL, &incB); } vIncA = FFTVectSet1(incA); vIncB = FFTVectSet1(incB); for (size_t col=0; col<numCols; 
col+=FFT_FLOATS_PER_VECTOR) { FFTVector vRTop; FFTVector vITop; // prefetch these pc.loadVect(vRTop, vITop); FFTVector vcosv = vCurCos; FFTVector vsinv = vCurSin; if(col < (numCols - FFT_FLOATS_PER_VECTOR - 1)) { /* Update vCurSin and vCurCos unless we're at end of row. */ if((FFT_SIN_RECALC_COMPLEX > 0) && ((col % FFT_SIN_RECALC_COMPLEX) == (unsigned)(FFT_SIN_RECALC_COMPLEX - FFT_FLOATS_PER_VECTOR))) { size_t newCol = col + FFT_FLOATS_PER_VECTOR; for (angleIndex = 0; angleIndex < FFT_FLOATS_PER_VECTOR; angleIndex++) { /* * Note that we might be using a fully populated sine table even if * we're configured with FFT_SIN_RECALC_COMPLEX > 0. This happens * when we're running as a subplan of a 1-D real FFT. In that * case we're using the 1-D real's sine table, which is always fully * populated. */ if(mfftPlan->sinPeriod) { fftSinCosOpt(mfftPlan, row, newCol+angleIndex, &transferCos.f[angleIndex], &transferSin.f[angleIndex]); } else { fftSinCos(mfftPlan, row*(newCol+angleIndex), &transferCos.f[angleIndex], &transferSin.f[angleIndex]); } } vCurCos = transferCos.v; vCurSin = transferSin.v; } else { // vTempCos = vCurCos - incA*curCos - incB*curSin; vTempCos = FFTVectSub(vCurCos, FFTVectAdd(FFTVectMul(vIncA, vCurCos), FFTVectMul(vIncB, vCurSin))); // curSin = curSin - incA*curSin + incB*curCos; vCurSin = FFTVectSub(vCurSin, FFTVectSub(FFTVectMul(vIncA, vCurSin), FFTVectMul(vIncB, vCurCos))); vCurCos = vTempCos; } } // sinv *= imagSign; vsinv = FFTVectMul(vsinv, vImagSign); // real = (cosv * rTop) - (sinv * iTop); FFTVector vr = FFTVectSub(FFTVectMul(vcosv, vRTop), FFTVectMul(vsinv, vITop)); // imag = (cosv * iTop) + (sinv * rTop); FFTVector vi = FFTVectAdd(FFTVectMul(vcosv, vITop), FFTVectMul(vsinv, vRTop)); pc.storeVect(vr, vi); pc.offset(FFT_FLOATS_PER_VECTOR); } } #if FFT_SPLIT_COMPLEX dumpMatrixRect("fft1DTwistOpt output", &start, rowsToProcess, numCols); #else dumpMatrixRect("fft1DTwistOpt output", start, rowsToProcess, numCols); #endif return MR_Success; }