/*
 * Compute the inner product of the vectors x and y of global length n >= 0.
 *
 *   p     number of processors taking part
 *   s     number of the calling processor (used as its slot in the
 *         partial-sum table)
 *   n     global vector length; only the first nloc(p,s,n) entries of the
 *         local arrays are read
 *   x, y  local parts of the two vectors
 *
 * Every processor writes its local partial sum into slot s of a table on
 * every processor t, then adds up the p slots itself, so the returned
 * value is the full inner product on each caller.
 *
 * The function contains two bsp_sync() calls, so it has to be called by
 * all processors together.
 */
double bspip(int p, int s, int n, double *x, double *y){
    int nloc(int p, int s, int n);
    double inprod, *Inprod, alpha;
    int i, t, nl;

    Inprod= vecallocd(p);
    bsp_push_reg(Inprod,p*SZDBL);
    bsp_sync();

    /* The local length does not change during the loop: compute it once
       instead of calling nloc in every loop test. */
    nl= nloc(p,s,n);
    inprod= 0.0;
    for (i=0; i<nl; i++){
        inprod += x[i]*y[i];
    }

    /* Send my partial sum to slot s on every processor, myself included. */
    for (t=0; t<p; t++){
        bsp_put(t,&inprod,Inprod,s*SZDBL,SZDBL);
    }
    bsp_sync();

    alpha= 0.0;
    for (t=0; t<p; t++){
        alpha += Inprod[t];
    }
    bsp_pop_reg(Inprod);
    vecfreed(Inprod);

    return alpha;

} /* end bspip */
/*
 * Redistribute one local complex vector by sending it in packets to the
 * registered array pm on other processors.
 *
 *   x       local source data, stored as interleaved (re, im) pairs
 *   i       row number inside pm: the destination offset is advanced by
 *           i*length complex elements
 *   length  number of complex elements of x handled by this call
 *   M, N    multipliers used to turn the one-dimensional destination
 *           number into a processor id (presumably the processor grid
 *           dimensions -- confirm against the caller)
 *   s       processor coordinate that stays fixed during this call
 *   t       processor coordinate used to compute the global indices
 *   c0, c1  old and new distribution parameters; c1/c0 is the stride
 *           between the elements that go into one packet
 *   rev     if nonzero, rho_p[t] is used in place of t
 *   rho_p   permutation table, only read when rev is nonzero
 *   pm      registered destination array, written with bsp_put
 *   col     0: destination processor is s + M*d  (processor rows)
 *           otherwise: destination processor is N*s + d  (processor columns)
 *
 * The puts only take effect at the next synchronization, which is left to
 * the caller; this function does not call bsp_sync().
 */
void bspredistr(double *x, int i, int length, int M, int N, int s, int t, int c0, int c1,char rev, int *rho_p, double *pm, int col){
    const int stride= c1/c0;
    const int chunk= MAX(length/stride,1);      /* complex elements per packet */
    const int count= length/chunk;              /* number of packets */
    double *packet= vecallocd(2*chunk);
    const int me= rev ? rho_p[t] : t;
    const int low= me%c0;
    const int high= me/c0;
    int pk, k;

    for (pk=0; pk<count; pk++){
        /* Global index of the first element of this packet. */
        const int jglob= high*c0*length + pk*c0 + low;
        const int group= (jglob/(c1*length))*c1 + jglob%c1;
        const int target= (col == 0) ? s + M*group : N*s + group;

        /* bsp_put offsets are measured from the start of pm, so the
           position inside row i is reached by skipping i*length
           complex elements first. */
        const int slot= i*length + (jglob%(c1*length))/c1;

        /* Gather every stride-th complex element into the packet. */
        for (k=0; k<chunk; k++){
            const double *pair= &x[2*(pk + k*stride)];
            packet[2*k]= pair[0];
            packet[2*k+1]= pair[1];
        }
        bsp_put(target,packet,pm,slot*2*SZDBL,chunk*2*SZDBL);
    }
    vecfreed(packet);

} /* end bspredistr */
void bspinprod(){ double bspip(int p, int s, int n, double *x, double *y); int nloc(int p, int s, int n); double *x, alpha, time0, time1; int p, s, n, nl, i, iglob; bsp_begin(P); p= bsp_nprocs(); /* p = number of processors obtained */ s= bsp_pid(); /* s = processor number */ if (s==0){ printf("Please enter n:\n"); fflush(stdout); scanf("%d",&n); if(n<0) bsp_abort("Error in input: n is negative"); } bsp_push_reg(&n,SZINT); bsp_sync(); bsp_get(0,&n,0,&n,SZINT); bsp_sync(); bsp_pop_reg(&n); nl= nloc(p,s,n); x= vecallocd(nl); for (i=0; i<nl; i++){ iglob= i*p+s; x[i]= iglob+1; } bsp_sync(); time0=bsp_time(); alpha= bspip(p,s,n,x,x); bsp_sync(); time1=bsp_time(); printf("Processor %d: sum of squares up to %d*%d is %.lf\n", s,n,n,alpha); fflush(stdout); if (s==0){ printf("This took only %.6lf seconds.\n", time1-time0); fflush(stdout); } vecfreed(x); bsp_end(); } /* end bspinprod */
void bspbench(){ void leastsquares(int h0, int h1, double *t, double *g, double *l); int p, s, s1, iter, i, n, h, destproc[MAXH], destindex[MAXH]; double alpha, beta, x[MAXN], y[MAXN], z[MAXN], src[MAXH], *dest, time0, time1, time, *Time, mintime, maxtime, nflops, r, g0, l0, g, l, t[MAXH+1]; /**** Determine p ****/ bsp_begin(P); p= bsp_nprocs(); /* p = number of processors obtained */ s= bsp_pid(); /* s = processor number */ Time= vecallocd(p); bsp_push_reg(Time,p*SZDBL); dest= vecallocd(2*MAXH+p); bsp_push_reg(dest,(2*MAXH+p)*SZDBL); bsp_sync(); /**** Determine r ****/ for (n=1; n <= MAXN; n *= 2){ /* Initialize scalars and vectors */ alpha= 1.0/3.0; beta= 4.0/9.0; for (i=0; i<n; i++){ z[i]= y[i]= x[i]= (double)i; } /* Measure time of 2*NITERS DAXPY operations of length n */ time0=bsp_time(); for (iter=0; iter<NITERS; iter++){ for (i=0; i<n; i++) y[i] += alpha*x[i]; for (i=0; i<n; i++) z[i] -= beta*x[i]; } time1= bsp_time(); time= time1-time0; bsp_put(0,&time,Time,s*SZDBL,SZDBL); bsp_sync(); /* Processor 0 determines minimum, maximum, average30 INTRODUCTION computing rate */ if (s==0){ mintime= maxtime= Time[0]; for(s1=1; s1<p; s1++){ mintime= MIN(mintime,Time[s1]); maxtime= MAX(maxtime,Time[s1]); } if (mintime>0.0){ /* Compute r = average computing rate in flop/s */ nflops= 4*NITERS*n; r= 0.0; for(s1=0; s1<p; s1++) r += nflops/Time[s1]; r /= p; printf("n= %5d min= %7.3lf max= %7.3lf av= %7.3lf Mflop/s ", n, nflops/(maxtime*MEGA),nflops/ (mintime*MEGA), r/MEGA); fflush(stdout); /* Output for fooling benchmark-detecting compilers */ printf(" fool=%7.1lf\n",y[n-1]+z[n-1]); } else printf("minimum time is 0\n"); fflush(stdout); } } /**** Determine g and l ****/ for (h=0; h<=MAXH; h++){ /* Initialize communication pattern */ for (i=0; i<h; i++){ src[i]= (double)i; if (p==1){ destproc[i]=0; destindex[i]=i; } else { /* destination processor is one of the p-1 others */ destproc[i]= (s+1 + i%(p-1)) %p; /* destination index is in my own part of dest */ destindex[i]= s + 
(i/(p-1))*p; } } /* Measure time of NITERS h-relations */ bsp_sync(); time0= bsp_time(); for (iter=0; iter<NITERS; iter++){ for (i=0; i<h; i++) bsp_put(destproc[i],&src[i],dest,destindex[i]*SZDBL, SZDBL); bsp_sync(); } time1= bsp_time(); time= time1-time0; /* Compute time of one h-relation */ if (s==0){ t[h]= (time*r)/NITERS; printf("Time of %5d-relation= %lf sec= %8.0lf flops\n", h, time/NITERS, t[h]); fflush(stdout); } } if (s==0){ printf("size of double = %d bytes\n",(int)SZDBL); leastsquares(0,p,t,&g0,&l0); printf("Range h=0 to p : g= %.1lf, l= %.1lf\n",g0,l0); leastsquares(p,MAXH,t,&g,&l); printf("Range h=p to HMAX: g= %.1lf, l= %.1lf\n",g,l); printf("The bottom line for this BSP computer is:\n"); printf("p= %d, r= %.3lf Mflop/s, g= %.1lf, l= %.1lf\n", p,r/MEGA,g,l); fflush(stdout); } bsp_pop_reg(dest); vecfreed(dest); bsp_pop_reg(Time); vecfreed(Time); bsp_end(); } /* end bspbench */
void bspfft_test() { void bspfft( double * x, int n, int p, int s, int sign, double * w0, double * w, double * tw, int *rho_np, int *rho_p ); void bspfft_init( int n, int p, int s, double * w0, double * w, double * tw, int *rho_np, int *rho_p ); int k1_init( int n, int p ); int p, s, n, q, np, k1, j, jglob, it, *rho_np, *rho_p; double time0, time1, time2, ffttime, nflops, max_error, error_re, error_im, error, *Error, *x, *w0, *w, *tw; bsp_begin( P ); p = bsp_nprocs(); s = bsp_pid(); bsp_push_reg( &n, SZINT ); Error = vecallocd( p ); bsp_push_reg( Error, p * SZDBL ); bsp_sync(); if ( s == 0 ) { printf( "Please enter length n: \n" ); #ifdef _WIN32 scanf_s( "%d", &n ); #else scanf( "%d", &n ); #endif if ( n < 2 * p ) { bsp_abort( "Error in input: n < 2p" ); } for ( q = 1; q < p; q++ ) { bsp_put( q, &n, &n, 0, SZINT ); } } bsp_sync(); if ( s == 0 ) { printf( "FFT of vector of length %d using %d processors\n", n, p ); printf( "performing %d forward and %d backward transforms\n", NITERS, NITERS ); } /* Allocate, register, and initialize vectors */ np = n / p; x = vecallocd( 2 * np ); bsp_push_reg( x, 2 * np * SZDBL ); k1 = k1_init( n, p ); w0 = vecallocd( k1 ); w = vecallocd( np ); tw = vecallocd( 2 * np + p ); rho_np = vecalloci( np ); rho_p = vecalloci( p ); for ( j = 0; j < np; j++ ) { jglob = j * p + s; x[2 * j] = ( double )jglob; x[2 * j + 1] = 1.0; } bsp_sync(); time0 = bsp_time(); /* Initialize the weight and bit reversal tables */ for ( it = 0; it < NITERS; it++ ) { bspfft_init( n, p, s, w0, w, tw, rho_np, rho_p ); } bsp_sync(); time1 = bsp_time(); /* Perform the FFTs */ for ( it = 0; it < NITERS; it++ ) { bspfft( x, n, p, s, 1, w0, w, tw, rho_np, rho_p ); bspfft( x, n, p, s, -1, w0, w, tw, rho_np, rho_p ); } bsp_sync(); time2 = bsp_time(); /* Compute the accuracy */ max_error = 0.0; for ( j = 0; j < np; j++ ) { jglob = j * p + s; error_re = fabs( x[2 * j] - ( double )jglob ); error_im = fabs( x[2 * j + 1] - 1.0 ); error = sqrt( error_re * error_re + error_im * 
error_im ); if ( error > max_error ) { max_error = error; } } bsp_put( 0, &max_error, Error, s * SZDBL, SZDBL ); bsp_sync(); if ( s == 0 ) { max_error = 0.0; for ( q = 0; q < p; q++ ) { if ( Error[q] > max_error ) { max_error = Error[q]; } } } for ( j = 0; j < NPRINT && j < np; j++ ) { jglob = j * p + s; printf( "proc=%d j=%d Re= %f Im= %f \n", s, jglob, x[2 * j], x[2 * j + 1] ); } fflush( stdout ); bsp_sync(); if ( s == 0 ) { printf( "Time per initialization = %lf sec \n", ( time1 - time0 ) / NITERS ); ffttime = ( time2 - time1 ) / ( 2.0 * NITERS ); printf( "Time per FFT = %lf sec \n", ffttime ); nflops = 5 * n * log( ( double )n ) / log( 2.0 ) + 2 * n; printf( "Computing rate in FFT = %lf Mflop/s \n", nflops / ( MEGA * ffttime ) ); printf( "Absolute error= %e \n", max_error ); printf( "Relative error= %e \n\n", max_error / n ); } bsp_pop_reg( x ); bsp_pop_reg( Error ); bsp_pop_reg( &n ); bsp_sync(); vecfreei( rho_p ); vecfreei( rho_np ); vecfreed( tw ); vecfreed( w ); vecfreed( w0 ); vecfreed( x ); vecfreed( Error ); bsp_end(); } /* end bspfft_test */
void bspbench(){ void leastsquares(int h0, int h1, double *t, double *g, double *l); int p, s, s1, iter, i, n, h, destproc[MAXH], destindex[MAXH]; double alpha, beta, x[MAXN], y[MAXN], z[MAXN], src[MAXH], *dest, time0, time1, time, *Time, mintime, maxtime, nflops, r, g0, l0, g, l, t[MAXH+1]; size_t pin[100]; // Determine p // start: new code for pinning for (i=0; i< tnode->length; i++) pin[i] = tnode->sons[i]->index; mcbsp_set_pinning( pin, tnode->length ); bsp_begin(tnode->length); // end: new code for pinning p= bsp_nprocs(); // p = number of processors obtained s= bsp_pid(); // s = processor number Time= vecallocd(p); bsp_push_reg(Time,p*SZDBL); dest= vecallocd(2*(MAXH+p)); bsp_push_reg(dest,(2*(MAXH+p))*SZDBL); bsp_sync(); // Determine r for (n=1; n < MAXN; n *= 2){ // Initialize scalars and vectors alpha= 1.0/3.0; beta= 4.0/9.0; for (i=0; i<n; i++){ z[i]= y[i]= x[i]= (double)i; } // Measure time of 2*NITERS DAXPY operations of length n time0=bsp_time(); for (iter=0; iter<NITERS; iter++){ for (i=0; i<n; i++) y[i] += alpha*x[i]; for (i=0; i<n; i++) z[i] -= beta*x[i]; } time1= bsp_time(); time= time1-time0; bsp_put(0,&time,Time,s*SZDBL,SZDBL); bsp_sync(); // Processor 0 determines minimum, maximum, average computing rate if (s==0){ mintime= maxtime= Time[0]; for(s1=1; s1<p; s1++){ mintime= MIN(mintime,Time[s1]); maxtime= MAX(maxtime,Time[s1]); } if (mintime>0.0){ // Compute r = average computing rate in flop/s nflops= 4*NITERS*n; r= 0.0; for(s1=0; s1<p; s1++) r += nflops/Time[s1]; r /= p; //printf("n= %5d min= %7.3lf max= %7.3lf av= %7.3lf Mflop/s ", // n, nflops/(maxtime*MEGA),nflops/(mintime*MEGA), r/MEGA); //fflush(stdout); // Output for fooling benchmark-detecting compilers printf( "", y[n-1]+z[n-1] ); } } } // Determine g and l for (h=0; h<=MAXH; h++){ // Initialize communication pattern for (i=0; i<h; i++){ src[i]= (double)i; if (p==1){ destproc[i]=0; destindex[i]=i; } else { // destination processor is one of the p-1 others destproc[i]= (s+1 + i%(p-1)) %p; 
// destination index is in my own part of dest destindex[i]= s + (i/(p-1))*p; } } for (i=0; i<h; i++){ src[i]= (double)i; if (p==1){ destproc[i]=0; destindex[i]=i; } else { // destination processor is one of the p-1 others destproc[i]= (s+1 + i%(p-1)) %p; // destination index is in my own part of dest destindex[i]= s + (i/(p-1))*p; } } // Measure time of NITERS h-relations bsp_sync(); time0= bsp_time(); for (iter=0; iter<NITERS; iter++){ for (i=0; i<h; i++) { //bsp_get(0, dest, destindex[i]*SZDBL, &src[i] , SZDBL); //bsp_get(destproc[i], dest, destindex[i]*SZDBL, &src[i] , SZDBL); bsp_put(destproc[i], &src[i] , dest , destindex[i]*SZDBL, SZDBL); } //if (s == 0) // bsp_get(0, dest, destindex[i]*SZDBL, &src[i] , SZDBL); bsp_sync(); } time1= bsp_time(); time= time1-time0; // Compute time of one h-relation if (s==0){ t[h]= (time*r)/NITERS; //#define SEHLOC_BENCH_VERBOSE #ifdef SEHLOC_BENCH_VERBOSE char strnodes[256]; sprintf(strnodes, ""); for (i=0; i<tnode->length; i++) { sprintf(strnodes, "%s %d", strnodes, tnode->sons[i]->index); } printf("SEH# Level%d %5d %lf %8.0lf\n", tnode->level, h, time/NITERS, t[h]); fflush(stdout); #endif } } if (s==0){ leastsquares(0,p,t,&g0,&l0); printf("Range h=0 to p : g= %.1lf, l= %.1lf\n",g0,l0); leastsquares(p,MAXH,t,&g,&l); g=(g>0)? g: g0*2; printf("Range h=p to HMAX: g= %.1lf, l= %.1lf\n",g,l); //printf("plot# %d %.1lf %.1lf\n",tnode->level, g,l); printf("The bottom line for this MultiBSP component is:\n"); printf("<p= %d, r= %.3lf Mflop/s, g= %.1lf, l= %.1lf>\n", p,r/MEGA,g,l); fflush(stdout); } bsp_pop_reg(dest); vecfreed(dest); bsp_pop_reg(Time); vecfreed(Time); bsp_end(); } /* end bspbench */