Esempio n. 1
0
/*
 * Radix-2 decimation-in-time Cooley-Tukey butterfly passes, done in place.
 *
 * K        number of points; stages run for sub-transform sizes
 *          N = 2, 4, 8, ... while N <= K.
 * indices  table filled by calcFftIndices(K, indices); every access to x
 *          goes through it.
 * x        working buffer, overwritten by the butterflies.
 * f        output buffer; on return f[i] = x[indices[i]] for i in [0, K).
 *
 * NOTE(review): the j loop only stays inside [0, K) when K is a multiple of
 * every N visited (presumably K is a power of two) -- confirm against callers.
 */
void radix2DitCooleyTukeyFft(int K, int* indices, Complex* x, Complex* f) {

    int i;
    int N;
    int j;
    int k;
    int step;
    int eI;
    int oI;
    Complex even;
    Complex odd;
    Complex prod;
    float fftSin;
    float fftCos;
    float arg;

    calcFftIndices(K, indices);

    // Profiling hooks (kernel_invocations counter, PROFILE_MODE == 2 timers)
    // are intentionally left out: performance is not measured here.

    // step is half the sub-transform size N. Bounding step by K / 2 visits
    // exactly the stages N = 2, 4, ... <= K, but never forms a value larger
    // than K, so the doubling cannot overflow int even for very large K.
    for (step = 1; step <= K / 2; step *= 2) {
        N = step * 2;
        for (j = 0; j < K; j += N) {
            for (k = 0; k < step; k++) {
                arg = (float)k / N;
                eI = j + k;
                oI = j + step + k;

                fftSinCos(arg, &fftSin, &fftCos);

                even = x[indices[eI]];
                odd = x[indices[oI]];

                // Twiddle product, computed once from the *unmodified* odd
                // sample. Recomputing it after x[indices[oI]].real has been
                // stored would feed the new real part into the imaginary
                // output and corrupt the butterfly.
                prod.real = odd.real * fftCos - odd.imag * fftSin;
                prod.imag = odd.imag * fftCos + odd.real * fftSin;

                x[indices[eI]].real = even.real + prod.real;
                x[indices[eI]].imag = even.imag + prod.imag;

                x[indices[oI]].real = even.real - prod.real;
                x[indices[oI]].imag = even.imag - prod.imag;
            }
        }
    }

    // Gather the results into f in index-table order.
    for (i = 0; i < K; ++i) {
        f[i] = x[indices[i]];
    }

}
/*
 * Plain vanilla, unoptimized, platform-independent twist.
 *
 * For each row in [startRow, startRow + rowsToProcess) and each column in
 * [0, numCols), the element visited by the PolyComplex cursor is multiplied
 * in place by (cosv + j * imagSign * sinv), where cosv/sinv are obtained from
 * fftSinCos(mfftPlan, row * col, ...) and imagSign is -1 for a forward
 * transform, +1 otherwise.
 *
 * mfftPlan       plan; its sinTableType must be STT_Standard or STT_External.
 * buf            matrix data, modified in place.
 * forward        selects the sign applied to the sine term.
 * numRows        not referenced by this routine.
 * numCols        number of columns per row.
 * startRow       first row to process.
 * rowsToProcess  number of consecutive rows to process.
 *
 * Always returns MR_Success.
 */
static MFFTReturn fft1DTwistSmall(
	MatrixFFTPlan		mfftPlan,
	FFTComplex			*buf,
	bool				forward,	
	size_t				numRows,
	size_t				numCols,
	size_t				startRow,
	size_t				rowsToProcess)
{
	RFASSERT((mfftPlan->sinTableType == STT_Standard) ||
	         (mfftPlan->sinTableType == STT_External));

	FFTFloat imagSign = forward ? -1.0 : 1.0;
	
	#if	DUMP_MATRIX
	#if FFT_SPLIT_COMPLEX
	FFTComplex start;
	fftComplexOffset(buf, startRow * numCols, &start);
	dumpMatrixRect("fft1DTwistSmall input", &start, rowsToProcess, numCols);
	#else
	FFTComplex *start = buf + (startRow * numCols);
	dumpMatrixRect("fft1DTwistSmall input", start, rowsToProcess, numCols);
	#endif	/* FFT_SPLIT_COMPLEX */
	#endif	/* DUMP_MATRIX */
	
	size_t row = startRow;
	for(size_t rowDex=0; rowDex<rowsToProcess; rowDex++, row++) {
		/* Cursor over this row; presumably positioned at element row * numCols. */
		PolyComplex pc(buf, row * numCols);
		for(size_t col=0; col<numCols; col++) {
			FFTFloat cosv, sinv;
			fftSinCos(mfftPlan, row*col, &cosv, &sinv);
			
			sinv *= imagSign;
			/* Complex multiply; both products use the original real/imag values. */
			FFTFloat r = (cosv * pc.real()) - (sinv * pc.imag());
			FFTFloat i = (cosv * pc.imag()) + (sinv * pc.real());
			pc.real(r);
			pc.imag(i);
			++pc;
			
		}
	}

	/*
	 * 'start' only exists when DUMP_MATRIX is enabled, so the output dump
	 * must sit under the same guard as the input dump.
	 */
	#if	DUMP_MATRIX
	#if FFT_SPLIT_COMPLEX
	dumpMatrixRect("fft1DTwistSmall output", &start, rowsToProcess, numCols);
	#else
	dumpMatrixRect("fft1DTwistSmall output", start, rowsToProcess, numCols);
	#endif	/* FFT_SPLIT_COMPLEX */
	#endif	/* DUMP_MATRIX */
	
	return MR_Success;
}
/*
 * Intel, precision-independent. 
 *
 * Vectorized twist: for each row in [startRow, startRow + rowsToProcess),
 * FFT_FLOATS_PER_VECTOR elements at a time are multiplied in place by
 * (cos + j * imagSign * sin), with imagSign = -1 for a forward transform and
 * +1 otherwise.
 *
 * The cos/sin vectors for the first FFT_FLOATS_PER_VECTOR columns come from
 * the plan's sine lookup (fftSinCosOpt when mfftPlan->sinPeriod is nonzero,
 * fftSinCos otherwise). Subsequent vectors are advanced with the recurrence
 *     cos' = cos - (incA * cos + incB * sin)
 *     sin' = sin - (incA * sin - incB * cos)
 * where incA = 2 * s^2 for the sine value s looked up at half the per-vector
 * step and incB is the sine value looked up at the full per-vector step.
 * When FFT_SIN_RECALC_COMPLEX > 0 the vectors are instead re-read from the
 * lookup once every FFT_SIN_RECALC_COMPLEX columns (presumably to bound the
 * accumulated recurrence error).
 *
 * mfftPlan       plan supplying the sine lookup and sinPeriod.
 * buf            matrix data, modified in place.
 * forward        selects the sign applied to the sine term.
 * numRows        not referenced by this routine.
 * numCols        number of columns per row; the column loop advances by
 *                FFT_FLOATS_PER_VECTOR (NOTE(review): presumably numCols is
 *                a multiple of it -- confirm against callers).
 * startRow       first row to process.
 * rowsToProcess  number of consecutive rows to process.
 *
 * Always returns MR_Success.
 */
static MFFTReturn fft1DTwistOpt(
	MatrixFFTPlan		mfftPlan,
	FFTComplex			*buf,
	bool				forward,	
	size_t				numRows,	
	size_t				numCols,
	size_t				startRow,	
	size_t				rowsToProcess)
{
	FFTFloat 	imagSign = forward ? -1.0 : 1.0;
	FFTVector	vImagSign = FFTVectSet1(imagSign);
	size_t		lastRow = startRow + rowsToProcess;
	
	#if	DUMP_MATRIX
	#if FFT_SPLIT_COMPLEX
	FFTComplex start;
	fftComplexOffset(buf, startRow * numCols, &start);
	dumpMatrixRect("fft1DTwistOpt input", &start, rowsToProcess, numCols);
	#else
	FFTComplex *start = buf + (startRow * numCols);
	dumpMatrixRect("fft1DTwistOpt input", start, rowsToProcess, numCols);
	#endif	/* FFT_SPLIT_COMPLEX */
	#endif	/* DUMP_MATRIX */

	for(size_t row=startRow; row<lastRow; row++) {
		size_t rowOff = numCols * row;
		PolyComplex pc(buf, rowOff);
		
		FFTVector		vTempCos;
		FFTVector		vCurCos;
		FFTVector		vCurSin;
		FFTVector		vIncA;
		FFTVector		vIncB;
		FFTVectUnion	transferCos;
		FFTVectUnion	transferSin;
		unsigned		angleIndex;
		
		// set up initial sin & cos vectors
		if(mfftPlan->sinPeriod) {
			for(angleIndex = 0; angleIndex < FFT_FLOATS_PER_VECTOR; angleIndex++) {
				fftSinCosOpt(mfftPlan, row, angleIndex, 
					&transferCos.f[angleIndex], &transferSin.f[angleIndex]);
			}
		}
		else {
			for(angleIndex = 0; angleIndex < FFT_FLOATS_PER_VECTOR; angleIndex++) {
				fftSinCos(mfftPlan, row*angleIndex, 
					&transferCos.f[angleIndex], &transferSin.f[angleIndex]);
			}
		}
		
		vCurCos = transferCos.v;
		vCurSin = transferSin.v;

		// angle of increment between steps, FFT_FLOATS_PER_VECTOR steps, since
		// each vector has FFT_FLOATS_PER_VECTOR elements
		
		FFTFloat incA, incB;
		if(mfftPlan->sinPeriod) {
			fftSinCosOpt(mfftPlan, row, FFT_FLOATS_PER_VECTOR / 2, NULL, &incA);
			incA = incA*incA*2;
			fftSinCosOpt(mfftPlan, row, FFT_FLOATS_PER_VECTOR, NULL, &incB);
		}
		else {
			size_t incAngle = row * FFT_FLOATS_PER_VECTOR;
			fftSinCos(mfftPlan, incAngle / 2, NULL, &incA);
			incA = incA*incA*2;
			fftSinCos(mfftPlan, incAngle, NULL, &incB);
		}
		vIncA = FFTVectSet1(incA);
		vIncB = FFTVectSet1(incB);
		
		for (size_t col=0; col<numCols; col+=FFT_FLOATS_PER_VECTOR) {
			FFTVector vRTop;
			FFTVector vITop;

			// prefetch these
			pc.loadVect(vRTop, vITop);
									
			FFTVector vcosv = vCurCos;
			FFTVector vsinv = vCurSin;

			/*
			 * Written as an addition on the left so the size_t arithmetic
			 * cannot wrap when numCols <= FFT_FLOATS_PER_VECTOR; the
			 * subtraction form would wrap to a huge value and make the
			 * "not at end of row" test succeed on the only iteration.
			 */
			if((col + FFT_FLOATS_PER_VECTOR + 1) < numCols) {
				/* Update vCurSin and vCurCos unless we're at end of row. */
				if((FFT_SIN_RECALC_COMPLEX > 0) && 
				   ((col % FFT_SIN_RECALC_COMPLEX) == 
				       (unsigned)(FFT_SIN_RECALC_COMPLEX - FFT_FLOATS_PER_VECTOR))) {
					size_t newCol = col + FFT_FLOATS_PER_VECTOR;
					for (angleIndex = 0; angleIndex < FFT_FLOATS_PER_VECTOR; angleIndex++) { 
						/*
						 * Note that we might be using a fully populated sine table even if 
						 * we're configured with FFT_SIN_RECALC_COMPLEX > 0. This happens
						 * when we're running as a subplan of a 1-D real FFT. In that
						 * case we're using the 1-D real's sine table, which is always fully
						 * populated.
						 */
						if(mfftPlan->sinPeriod) {
							fftSinCosOpt(mfftPlan, row, newCol+angleIndex, 
								&transferCos.f[angleIndex], &transferSin.f[angleIndex]);
						}
						else {
							fftSinCos(mfftPlan, row*(newCol+angleIndex), 
								&transferCos.f[angleIndex], &transferSin.f[angleIndex]);
						}
					}
					vCurCos = transferCos.v;
					vCurSin = transferSin.v;
				}
				else {
					// vTempCos = vCurCos - incA*curCos - incB*curSin;					
					vTempCos = FFTVectSub(vCurCos, FFTVectAdd(FFTVectMul(vIncA, vCurCos),
															  FFTVectMul(vIncB, vCurSin)));
					// curSin = curSin - incA*curSin + incB*curCos;
					// (must read the old vCurCos, hence the vTempCos temporary)
					vCurSin = FFTVectSub(vCurSin, FFTVectSub(FFTVectMul(vIncA, vCurSin),
															 FFTVectMul(vIncB, vCurCos)));
					vCurCos = vTempCos;
				}
			}
			
			// sinv *= imagSign;
			vsinv = FFTVectMul(vsinv, vImagSign);
			
			// real = (cosv * rTop) - (sinv * iTop);
			FFTVector vr = FFTVectSub(FFTVectMul(vcosv, vRTop), FFTVectMul(vsinv, vITop));
						
			// imag = (cosv * iTop) + (sinv * rTop);
			FFTVector vi = FFTVectAdd(FFTVectMul(vcosv, vITop), FFTVectMul(vsinv, vRTop));

			pc.storeVect(vr, vi);
			pc.offset(FFT_FLOATS_PER_VECTOR);
		}
	}
	/*
	 * 'start' only exists when DUMP_MATRIX is enabled, so the output dump
	 * must sit under the same guard as the input dump.
	 */
	#if	DUMP_MATRIX
	#if FFT_SPLIT_COMPLEX
	dumpMatrixRect("fft1DTwistOpt output", &start, rowsToProcess, numCols);
	#else
	dumpMatrixRect("fft1DTwistOpt output", start, rowsToProcess, numCols);
	#endif	/* FFT_SPLIT_COMPLEX */
	#endif	/* DUMP_MATRIX */
	return MR_Success;
}