void bspfft1d(double **a, int n1, int nlr,int nlc, int M, int N, int s, int t, int sign, double *w0, double *w, double *tw, int *rho_np, int *rho_p, double *pa, int col){ /** * a = local matrix * n1 = # of total columns (global) * nlr = # of local rows (how many fft on the rows we are doing) * nlc = # of local cols (the length of the 1D fft we are doing) * M,N = # of proc rows,cols * s,t = whoami (2D numbering) * sign = 1/-1 (fft/ifft) * w0,w,tw,rho_np,rho_p = tables necessary for 1D ffts * pa = pointer to the beginning of a (necessary to know where to put stuff) * col = 1 if in reality we are doing fft on the columns (locally it seems like rows, but something is different) * 0 otherwise (really rows) */ char rev; int k1, r, c0, c, ntw, j,i; double ninv; k1= k1_init(n1,N,nlc); // 1 step: for every row, permute and compute a local, unordered fft for(i=0;i<nlr;i++){ permute(a[i],nlc,rho_np); rev= TRUE; for(r=0; r<nlc/k1; r++) ufft(&a[i][2*r*k1],k1,sign,w0); } c0= 1; ntw= 0; for (c=k1; c<=N; c *=nlc){ //2 step: for every row redistribute it (according to col) for(i=0;i<nlr;i++) bspredistr(a[i],i,nlc,M,N,s,t,c0,c,rev,rho_p,pa,col); bsp_sync(); //sync is done only after every row has been redistributed rev= FALSE; //3 step: twiddle and perform an unordered fft on every row for(i=0;i<nlr;i++){ twiddle(a[i],nlc,sign,&tw[2*ntw*nlc]); ufft(a[i],nlc,sign,w); } c0= c; ntw++; } //if sign=-1 we are interested in computing the inverse fft if (sign==-1){ ninv= 1 / (double)n1; for(i=0;i<nlr;i++) { for(j=0; j<2*nlc; j++) a[i][j] *= ninv; } } } /* end bspfft */
/**
 * Endless demo loop: while k1_is_down() reports true, pulse the buzzer and
 * one LED, then move on to the next LED.  LEDs are addressed 1..4 and are
 * visited in the order 1, 2, 3, 4, 1, ...
 *
 * The LED position is kept in the range 0..3 instead of counting presses
 * without bound; an ever-growing int counter would eventually overflow
 * (undefined behaviour for signed int) and could then yield a negative
 * remainder, i.e. an invalid LED number.
 *
 * NOTE(review): k1_init() is called here without arguments, unlike the
 * k1_init(n1, N, nlc) used by the FFT routines in this file -- presumably
 * this is a key/button initialiser from a different project; confirm that
 * both are not meant to live in the same translation unit.
 *
 * Returns 0 nominally; the loop never terminates, so the return statement
 * is not reached.
 */
int test(void)
{
    int led = 0; /* zero-based LED slot, always within 0..3 */

    led_init();
    buzzer_init();
    k1_init();

    while (1) {
        if (k1_is_down()) {
            buzzer_on();
            led_on(led + 1);
            delay(1); /* delay unit is defined by delay(); not visible here */
            buzzer_off();
            led_off(led + 1);
            delay(1);

            /* Advance cyclically so the counter can never overflow. */
            led = (led + 1) % 4;
        }
    }

    return 0;
}
/**
 * This parallel function initializes all the tables used in the FFT.
 *
 * n1     = global length passed to nloc() and k1_init()
 * N      = # of processor columns
 * s, t   = processor coordinates; only t is used here (s is accepted for
 *          interface symmetry and ignored)
 * w0     = table filled by ufft_init() for length k1
 * w      = table filled by ufft_init() for the local length
 * tw     = twiddle tables; stage number i is written at offset
 *          2*i*(local length)
 * rho_np = table filled by bitrev_init() for the local length
 * rho_p  = table filled by bitrev_init() for N
 */
void bspfft1d_init(int n1, int N, int s, int t, double *w0, double *w,
                   double *tw, int *rho_np, int *rho_p)
{
    int local_len, first_len, stage, c;

    (void)s; /* unused, kept in the signature for callers */

    local_len = nloc(N, t, n1);

    bitrev_init(local_len, rho_np);
    bitrev_init(N, rho_p);

    first_len = k1_init(n1, N, local_len);
    ufft_init(first_len, w0);
    ufft_init(local_len, w);

    /* One twiddle table per stage; the stage sequence c = k1, k1*nlc, ...
       mirrors the stage loop of the transform itself. */
    stage = 0;
    c = first_len;
    while (c <= N) {
        double alpha = (t % c) / (double)c;

        twiddle_init(local_len, alpha, rho_np, tw + 2 * stage * local_len);
        stage++;
        c *= local_len;
    }
} /* end bspfft1d_init */
void bspfft_test() { void bspfft( double * x, int n, int p, int s, int sign, double * w0, double * w, double * tw, int *rho_np, int *rho_p ); void bspfft_init( int n, int p, int s, double * w0, double * w, double * tw, int *rho_np, int *rho_p ); int k1_init( int n, int p ); int p, s, n, q, np, k1, j, jglob, it, *rho_np, *rho_p; double time0, time1, time2, ffttime, nflops, max_error, error_re, error_im, error, *Error, *x, *w0, *w, *tw; bsp_begin( P ); p = bsp_nprocs(); s = bsp_pid(); bsp_push_reg( &n, SZINT ); Error = vecallocd( p ); bsp_push_reg( Error, p * SZDBL ); bsp_sync(); if ( s == 0 ) { printf( "Please enter length n: \n" ); #ifdef _WIN32 scanf_s( "%d", &n ); #else scanf( "%d", &n ); #endif if ( n < 2 * p ) { bsp_abort( "Error in input: n < 2p" ); } for ( q = 1; q < p; q++ ) { bsp_put( q, &n, &n, 0, SZINT ); } } bsp_sync(); if ( s == 0 ) { printf( "FFT of vector of length %d using %d processors\n", n, p ); printf( "performing %d forward and %d backward transforms\n", NITERS, NITERS ); } /* Allocate, register, and initialize vectors */ np = n / p; x = vecallocd( 2 * np ); bsp_push_reg( x, 2 * np * SZDBL ); k1 = k1_init( n, p ); w0 = vecallocd( k1 ); w = vecallocd( np ); tw = vecallocd( 2 * np + p ); rho_np = vecalloci( np ); rho_p = vecalloci( p ); for ( j = 0; j < np; j++ ) { jglob = j * p + s; x[2 * j] = ( double )jglob; x[2 * j + 1] = 1.0; } bsp_sync(); time0 = bsp_time(); /* Initialize the weight and bit reversal tables */ for ( it = 0; it < NITERS; it++ ) { bspfft_init( n, p, s, w0, w, tw, rho_np, rho_p ); } bsp_sync(); time1 = bsp_time(); /* Perform the FFTs */ for ( it = 0; it < NITERS; it++ ) { bspfft( x, n, p, s, 1, w0, w, tw, rho_np, rho_p ); bspfft( x, n, p, s, -1, w0, w, tw, rho_np, rho_p ); } bsp_sync(); time2 = bsp_time(); /* Compute the accuracy */ max_error = 0.0; for ( j = 0; j < np; j++ ) { jglob = j * p + s; error_re = fabs( x[2 * j] - ( double )jglob ); error_im = fabs( x[2 * j + 1] - 1.0 ); error = sqrt( error_re * error_re + error_im * 
error_im ); if ( error > max_error ) { max_error = error; } } bsp_put( 0, &max_error, Error, s * SZDBL, SZDBL ); bsp_sync(); if ( s == 0 ) { max_error = 0.0; for ( q = 0; q < p; q++ ) { if ( Error[q] > max_error ) { max_error = Error[q]; } } } for ( j = 0; j < NPRINT && j < np; j++ ) { jglob = j * p + s; printf( "proc=%d j=%d Re= %f Im= %f \n", s, jglob, x[2 * j], x[2 * j + 1] ); } fflush( stdout ); bsp_sync(); if ( s == 0 ) { printf( "Time per initialization = %lf sec \n", ( time1 - time0 ) / NITERS ); ffttime = ( time2 - time1 ) / ( 2.0 * NITERS ); printf( "Time per FFT = %lf sec \n", ffttime ); nflops = 5 * n * log( ( double )n ) / log( 2.0 ) + 2 * n; printf( "Computing rate in FFT = %lf Mflop/s \n", nflops / ( MEGA * ffttime ) ); printf( "Absolute error= %e \n", max_error ); printf( "Relative error= %e \n\n", max_error / n ); } bsp_pop_reg( x ); bsp_pop_reg( Error ); bsp_pop_reg( &n ); bsp_sync(); vecfreei( rho_p ); vecfreei( rho_np ); vecfreed( tw ); vecfreed( w ); vecfreed( w0 ); vecfreed( x ); vecfreed( Error ); bsp_end(); } /* end bspfft_test */