示例#1
0
double bspip(int p, int s, int n, double *x, double *y){
    /* Compute inner product of vectors x and y of length n>=0.

       p    = number of processors
       s    = number of the calling processor
       n    = global vector length
       x, y = local vector parts; entries 0..nloc(p,s,n)-1 of each are read

       Each processor sums the products of its local entries and puts
       that partial sum into slot s of the array Inprod on every
       processor.  After the second bsp_sync() each processor adds up
       the p slots of its own copy and returns the total.

       The function issues two bsp_sync() calls. */

    int nloc(int p, int s, int n);
    double inprod, *Inprod, alpha;
    int i, t, nl;
  
    Inprod= vecallocd(p); bsp_push_reg(Inprod,p*SZDBL);
    bsp_sync();

    /* The arguments of nloc do not change inside the loop,
       so the local length is determined once up front. */
    nl= nloc(p,s,n);

    inprod= 0.0;
    for (i=0; i<nl; i++){
        inprod += x[i]*y[i];
    }
    /* Slot s of every processor's Inprod receives my partial sum */
    for (t=0; t<p; t++){
        bsp_put(t,&inprod,Inprod,s*SZDBL,SZDBL);
    }
    bsp_sync();

    alpha= 0.0;
    for (t=0; t<p; t++){
        alpha += Inprod[t];
    }
    bsp_pop_reg(Inprod); vecfreed(Inprod);

    return alpha;

} /* end bspip */
示例#2
0
 void bspredistr(double *x, int i, int length, int M, int N, int s, int t,
 int c0, int c1,char rev, int *rho_p, double *pm, int col){

   /* Send the local complex data in x, packet by packet, into the array
      pm on the destination processors.

      x       local source data, stored as (re, im) pairs of doubles
      i       block index: the destination offset is shifted by i*length
              complex elements
      length  number of complex elements handled locally
      M, N    factors used to turn a destination coordinate into a
              processor number (see col)
      s, t    coordinates of the calling processor; t (or rho_p[t] when
              rev is nonzero) is the coordinate being redistributed
      c0, c1  parameters of the old and the new distribution; c1/c0 is
              the stride between elements of one packet
      rev     nonzero: use rho_p[t] instead of t as own coordinate
      rho_p   table indexed by t, only read when rev is nonzero
      pm      destination array of the bsp_put calls
              (presumably registered by the caller - TODO confirm)
      col     0: destination processor is s + M*dest   (processor rows)
              otherwise:                  N*s + dest   (processor columns)

      No bsp_sync() is issued here. */

   const int stride  = c1/c0;
   const int chunk   = MAX(length/stride,1);
   const int nchunks = length/chunk;
   double *buf       = vecallocd(2*chunk);
   const int me      = rev ? rho_p[t] : t;
   const int lo      = me%c0;
   const int hi      = me/c0;
   int k, e;

   for (k=0; k<nchunks; k++){
     /* Global index of the first element of packet k */
     const int glob   = hi*c0*length + k*c0 + lo;
     /* Destination coordinate in the new distribution ... */
     const int owner  = (glob/(c1*length))*c1 + glob%c1;
     /* ... mapped to a processor number along a row or a column */
     const int target = (col == 0) ? s+M*owner : N*s+owner;
     /* The i*length term is there because the address of block i on
        the destination is not known; counting starts at the beginning
        of pm and jumps i blocks ahead. */
     const int offset = i*length + (glob%(c1*length))/c1;

     /* Gather the strided elements of this packet into the buffer */
     for (e=0; e<chunk; e++){
       const int from = 2*(k+e*stride);
       buf[2*e]   = x[from];
       buf[2*e+1] = x[from+1];
     }

     bsp_put(target,buf,pm,offset*2*SZDBL,chunk*2*SZDBL);
   }
   vecfreed(buf);

 } /* end bspredistr */
示例#3
0
void bspinprod(){
    
    double bspip(int p, int s, int n, double *x, double *y);
    int nloc(int p, int s, int n);
    double *x, alpha, time0, time1;
    int p, s, n, nl, i, iglob;
    
    bsp_begin(P);
    p= bsp_nprocs(); /* p = number of processors obtained */ 
    s= bsp_pid();    /* s = processor number */ 
    if (s==0){
        printf("Please enter n:\n"); fflush(stdout);
        scanf("%d",&n);
        if(n<0)
            bsp_abort("Error in input: n is negative");
    }
    bsp_push_reg(&n,SZINT);
    bsp_sync();

    bsp_get(0,&n,0,&n,SZINT);
    bsp_sync();
    bsp_pop_reg(&n);

    nl= nloc(p,s,n);
    x= vecallocd(nl);
    for (i=0; i<nl; i++){
        iglob= i*p+s;
        x[i]= iglob+1;
    }
    bsp_sync(); 
    time0=bsp_time();

    alpha= bspip(p,s,n,x,x);
    bsp_sync();  
    time1=bsp_time();

    printf("Processor %d: sum of squares up to %d*%d is %.lf\n",
            s,n,n,alpha); fflush(stdout);
    if (s==0){
        printf("This took only %.6lf seconds.\n", time1-time0);
        fflush(stdout);
    }

    vecfreed(x);
    bsp_end();

} /* end bspinprod */
示例#4
0
文件: bspbench.c 项目: wf34/parallel
void bspbench(){
    /* Benchmark program.

       Part 1 times 2*NITERS DAXPY operations for vector lengths
       n = 1, 2, 4, ... up to MAXN.  Processor 0 collects the times of
       all processors and derives r, the average computing rate in
       flop/s.

       Part 2 times NITERS repetitions of an h-relation for every h in
       0..MAXH and stores the time of one h-relation, expressed in
       flops, in t[h] on processor 0.

       Finally processor 0 passes t to leastsquares for the ranges
       0..p and p..MAXH and prints the resulting g and l together with
       p and r. */

    void leastsquares(int h0, int h1, double *t, double *g, double *l);
    int p, s, s1, iter, i, n, h, destproc[MAXH], destindex[MAXH];
    double alpha, beta, x[MAXN], y[MAXN], z[MAXN], src[MAXH], *dest,
           time0, time1, time, *Time, mintime, maxtime,
           nflops, r, g0, l0, g, l, t[MAXH+1];

    /**** Determine p ****/
    bsp_begin(P);
    p= bsp_nprocs(); /* p = number of processors obtained */
    s= bsp_pid();    /* s = processor number */
    Time= vecallocd(p); bsp_push_reg(Time,p*SZDBL);
    dest= vecallocd(2*MAXH+p); bsp_push_reg(dest,(2*MAXH+p)*SZDBL);
    bsp_sync();

    /**** Determine r ****/
    /* r is only assigned below when a nonzero time was measured; give
       it a defined value so that the later uses never read an
       indeterminate double. */
    r= 0.0;
    for (n=1; n <= MAXN; n *= 2){
        /* Initialize scalars and vectors */
        alpha= 1.0/3.0;
        beta= 4.0/9.0;
        for (i=0; i<n; i++){
            z[i]= y[i]= x[i]= (double)i;
        }
        /* Measure time of 2*NITERS DAXPY operations of length n */
        time0=bsp_time();
        for (iter=0; iter<NITERS; iter++){
            for (i=0; i<n; i++)
                y[i] += alpha*x[i];
            for (i=0; i<n; i++)
                z[i] -= beta*x[i];
        }
        time1= bsp_time();
        time= time1-time0;
        bsp_put(0,&time,Time,s*SZDBL,SZDBL);
        bsp_sync();

        /* Processor 0 determines minimum, maximum, average
           computing rate */
        if (s==0){
            mintime= maxtime= Time[0];
            for(s1=1; s1<p; s1++){
                mintime= MIN(mintime,Time[s1]);
                maxtime= MAX(maxtime,Time[s1]);
            }
            if (mintime>0.0){
                /* Compute r = average computing rate in flop/s */
                nflops= 4*NITERS*n;
                r= 0.0;
                for(s1=0; s1<p; s1++)
                    r += nflops/Time[s1];
                r /= p;
                printf("n= %5d min= %7.3lf max= %7.3lf av= %7.3lf Mflop/s ",
                       n, nflops/(maxtime*MEGA),nflops/
                       (mintime*MEGA), r/MEGA);
                fflush(stdout);
                /* Output for fooling benchmark-detecting compilers */
                printf(" fool=%7.1lf\n",y[n-1]+z[n-1]);
            } else {
                printf("minimum time is 0\n");
            }
            /* Flushed on both paths, as before */
            fflush(stdout);
        }
    }

    /**** Determine g and l ****/
    for (h=0; h<=MAXH; h++){
        /* Initialize communication pattern */
        for (i=0; i<h; i++){
            src[i]= (double)i;
            if (p==1){
                destproc[i]=0;
                destindex[i]=i;
            } else {
                /* destination processor is one of the p-1 others */
                destproc[i]= (s+1 + i%(p-1)) %p;
                /* destination index is in my own part of dest */
                destindex[i]= s + (i/(p-1))*p;
            }
        }

        /* Measure time of NITERS h-relations */
        bsp_sync();
        time0= bsp_time();
        for (iter=0; iter<NITERS; iter++){
            for (i=0; i<h; i++)
                bsp_put(destproc[i],&src[i],dest,destindex[i]*SZDBL,
                        SZDBL);
            bsp_sync();
        }
        time1= bsp_time();
        time= time1-time0;

        /* Compute time of one h-relation */
        if (s==0){
            t[h]= (time*r)/NITERS;
            printf("Time of %5d-relation= %lf sec= %8.0lf flops\n",
                   h, time/NITERS, t[h]); fflush(stdout);
        }
    }

    if (s==0){
        printf("size of double = %d bytes\n",(int)SZDBL);
        leastsquares(0,p,t,&g0,&l0);
        printf("Range h=0 to p : g= %.1lf, l= %.1lf\n",g0,l0);
        leastsquares(p,MAXH,t,&g,&l);
        printf("Range h=p to HMAX: g= %.1lf, l= %.1lf\n",g,l);
        printf("The bottom line for this BSP computer is:\n");
        printf("p= %d, r= %.3lf Mflop/s, g= %.1lf, l= %.1lf\n",
               p,r/MEGA,g,l);
        fflush(stdout);
    }
    bsp_pop_reg(dest); vecfreed(dest);
    bsp_pop_reg(Time); vecfreed(Time);
    bsp_end();
} /* end bspbench */
示例#5
0
void bspfft_test()
{
    void bspfft( double * x, int n, int p, int s, int sign, double * w0,
                 double * w, double * tw, int *rho_np, int *rho_p );
    void bspfft_init( int n, int p, int s, double * w0,
                      double * w, double * tw, int *rho_np, int *rho_p );
    int k1_init( int n, int p );

    int p, s, n, q, np, k1, j, jglob, it, *rho_np, *rho_p;
    double time0, time1, time2, ffttime, nflops,
           max_error, error_re, error_im, error,
           *Error, *x, *w0, *w, *tw;

    bsp_begin( P );
    p = bsp_nprocs();
    s = bsp_pid();

    bsp_push_reg( &n, SZINT );
    Error = vecallocd( p );
    bsp_push_reg( Error, p * SZDBL );
    bsp_sync();

    if ( s == 0 )
    {
        printf( "Please enter length n: \n" );

#ifdef _WIN32
        scanf_s( "%d", &n );
#else
        scanf( "%d", &n );
#endif

        if ( n < 2 * p )
        {
            bsp_abort( "Error in input: n < 2p" );
        }

        for ( q = 1; q < p; q++ )
        {
            bsp_put( q, &n, &n, 0, SZINT );
        }
    }

    bsp_sync();

    if ( s == 0 )
    {
        printf( "FFT of vector of length %d using %d processors\n", n, p );
        printf( "performing %d forward and %d backward transforms\n",
                NITERS, NITERS );
    }

    /* Allocate, register,  and initialize vectors */
    np = n / p;
    x = vecallocd( 2 * np );
    bsp_push_reg( x, 2 * np * SZDBL );
    k1 = k1_init( n, p );
    w0 = vecallocd( k1 );
    w =  vecallocd( np );
    tw = vecallocd( 2 * np + p );
    rho_np = vecalloci( np );
    rho_p =  vecalloci( p );

    for ( j = 0; j < np; j++ )
    {
        jglob = j * p + s;
        x[2 * j] = ( double )jglob;
        x[2 * j + 1] = 1.0;
    }

    bsp_sync();
    time0 = bsp_time();

    /* Initialize the weight and bit reversal tables */
    for ( it = 0; it < NITERS; it++ )
    {
        bspfft_init( n, p, s, w0, w, tw, rho_np, rho_p );
    }

    bsp_sync();
    time1 = bsp_time();

    /* Perform the FFTs */
    for ( it = 0; it < NITERS; it++ )
    {
        bspfft( x, n, p, s, 1, w0, w, tw, rho_np, rho_p );
        bspfft( x, n, p, s, -1, w0, w, tw, rho_np, rho_p );
    }

    bsp_sync();
    time2 = bsp_time();

    /* Compute the accuracy */
    max_error = 0.0;

    for ( j = 0; j < np; j++ )
    {
        jglob = j * p + s;
        error_re = fabs( x[2 * j] - ( double )jglob );
        error_im = fabs( x[2 * j + 1] - 1.0 );
        error = sqrt( error_re * error_re + error_im * error_im );

        if ( error > max_error )
        {
            max_error = error;
        }
    }

    bsp_put( 0, &max_error, Error, s * SZDBL, SZDBL );
    bsp_sync();

    if ( s == 0 )
    {
        max_error = 0.0;

        for ( q = 0; q < p; q++ )
        {
            if ( Error[q] > max_error )
            {
                max_error = Error[q];
            }
        }
    }

    for ( j = 0; j < NPRINT && j < np; j++ )
    {
        jglob = j * p + s;
        printf( "proc=%d j=%d Re= %f Im= %f \n", s, jglob, x[2 * j], x[2 * j + 1] );
    }

    fflush( stdout );
    bsp_sync();

    if ( s == 0 )
    {
        printf( "Time per initialization = %lf sec \n",
                ( time1 - time0 ) / NITERS );
        ffttime = ( time2 - time1 ) / ( 2.0 * NITERS );
        printf( "Time per FFT = %lf sec \n", ffttime );
        nflops = 5 * n * log( ( double )n ) / log( 2.0 ) + 2 * n;
        printf( "Computing rate in FFT = %lf Mflop/s \n",
                nflops / ( MEGA * ffttime ) );
        printf( "Absolute error= %e \n", max_error );
        printf( "Relative error= %e \n\n", max_error / n );
    }


    bsp_pop_reg( x );
    bsp_pop_reg( Error );
    bsp_pop_reg( &n );
    bsp_sync();

    vecfreei( rho_p );
    vecfreei( rho_np );
    vecfreed( tw );
    vecfreed( w );
    vecfreed( w0 );
    vecfreed( x );
    vecfreed( Error );
    bsp_end();

} /* end bspfft_test */
示例#6
0
void bspbench(){
    // Benchmark of one component described by the global tnode.
    //
    // Before bsp_begin the indices of tnode's sons are handed to
    // mcbsp_set_pinning, and tnode->length is used as the number of
    // processors requested.
    //
    // Part 1 times 2*NITERS DAXPY operations for n = 1, 2, 4, ... below
    // MAXN and lets processor 0 derive r, the average computing rate in
    // flop/s.  Part 2 times NITERS repetitions of an h-relation for
    // every h in 0..MAXH and stores the time of one h-relation, in
    // flops, in t[h] on processor 0.  Processor 0 then passes t to
    // leastsquares for the ranges 0..p and p..MAXH and prints g, l, p
    // and r.

    void leastsquares(int h0, int h1, double *t, double *g, double *l);
    enum { BENCH_MAX_PIN = 100 };   // capacity of the pinning table
    int p, s, s1, iter, i, n, h, destproc[MAXH], destindex[MAXH];
    double alpha, beta, x[MAXN], y[MAXN], z[MAXN], src[MAXH], *dest,
           time0, time1, time, *Time, mintime, maxtime,
           nflops, r, g0, l0, g, l, t[MAXH+1];

    size_t pin[BENCH_MAX_PIN];

    // Determine p
    // start: new code for pinning
    // The table is a fixed-size stack array; refuse to run rather than
    // write past its end.
    if (tnode->length > BENCH_MAX_PIN){
        printf("bspbench: more than %d sons, cannot set pinning\n",
               (int)BENCH_MAX_PIN);
        fflush(stdout);
        return;
    }
    for (i=0; i< tnode->length; i++) pin[i] = tnode->sons[i]->index;
    mcbsp_set_pinning( pin, tnode->length );
    bsp_begin(tnode->length);
    // end: new code for pinning

    p= bsp_nprocs(); // p = number of processors obtained
    s= bsp_pid();    // s = processor number

    Time= vecallocd(p); bsp_push_reg(Time,p*SZDBL);
    dest= vecallocd(2*(MAXH+p)); bsp_push_reg(dest,(2*(MAXH+p))*SZDBL);
    bsp_sync();

    // Determine r
    // r is only assigned below when a nonzero time was measured; give
    // it a defined value so that later uses never read an indeterminate
    // double.
    r= 0.0;

    for (n=1; n < MAXN; n *= 2){
        // Initialize scalars and vectors
        alpha= 1.0/3.0;
        beta= 4.0/9.0;
        for (i=0; i<n; i++){
            z[i]= y[i]= x[i]= (double)i;
        }
        // Measure time of 2*NITERS DAXPY operations of length n
        time0=bsp_time();
        for (iter=0; iter<NITERS; iter++){
            for (i=0; i<n; i++)
                y[i] += alpha*x[i];
            for (i=0; i<n; i++)
                z[i] -= beta*x[i];
        }
        time1= bsp_time();
        time= time1-time0;
        bsp_put(0,&time,Time,s*SZDBL,SZDBL);
        bsp_sync();

        // Processor 0 determines minimum, maximum, average computing rate
        if (s==0){
            mintime= maxtime= Time[0];
            for(s1=1; s1<p; s1++){
                mintime= MIN(mintime,Time[s1]);
                maxtime= MAX(maxtime,Time[s1]);
            }
            if (mintime>0.0){
                // Compute r = average computing rate in flop/s
                nflops= 4*NITERS*n;
                r= 0.0;
                for(s1=0; s1<p; s1++)
                    r += nflops/Time[s1];
                r /= p;

                // Output for fooling benchmark-detecting compilers:
                // the argument is evaluated but nothing is printed.
                printf( "", y[n-1]+z[n-1] );
            }
        }
    }

    // Determine g and l
    for (h=0; h<=MAXH; h++){
        // Initialize communication pattern
        for (i=0; i<h; i++){
            src[i]= (double)i;
            if (p==1){
                destproc[i]=0;
                destindex[i]=i;
            } else {
                // destination processor is one of the p-1 others
                destproc[i]= (s+1 + i%(p-1)) %p;
                // destination index is in my own part of dest
                destindex[i]= s + (i/(p-1))*p;
            }
        }

        // Measure time of NITERS h-relations
        bsp_sync();

        time0= bsp_time();
        for (iter=0; iter<NITERS; iter++){
            for (i=0; i<h; i++) {
                bsp_put(destproc[i], &src[i], dest, destindex[i]*SZDBL, SZDBL);
            }
            bsp_sync();
        }

        time1= bsp_time();
        time= time1-time0;

        // Compute time of one h-relation
        if (s==0){
            t[h]= (time*r)/NITERS;
//#define SEHLOC_BENCH_VERBOSE
#ifdef SEHLOC_BENCH_VERBOSE
            // Build " idx idx ..." by appending at a tracked offset;
            // passing the buffer to sprintf as both destination and
            // %s source is undefined behaviour.
            char strnodes[256];
            size_t used = 0;
            strnodes[0] = '\0';
            for (i=0; i<tnode->length && used < sizeof strnodes; i++) {
                int w = snprintf(strnodes + used, sizeof strnodes - used,
                                 " %d", tnode->sons[i]->index);
                if (w < 0) break;
                used += (size_t)w;
            }
            printf("SEH# Level%d %5d %lf %8.0lf\n", tnode->level, h, time/NITERS, t[h]); fflush(stdout);
#endif
        }
    }

    if (s==0){
        leastsquares(0,p,t,&g0,&l0);
        printf("Range h=0 to p   : g= %.1lf, l= %.1lf\n",g0,l0);
        leastsquares(p,MAXH,t,&g,&l);
        // Fall back to twice the short-range g when the fit is not positive
        g=(g>0)? g: g0*2;
        printf("Range h=p to HMAX: g= %.1lf, l= %.1lf\n",g,l);
        printf("The bottom line for this MultiBSP component is:\n");
        printf("<p= %d, r= %.3lf Mflop/s, g= %.1lf, l= %.1lf>\n",
               p,r/MEGA,g,l);
        fflush(stdout);
    }
    bsp_pop_reg(dest); vecfreed(dest);
    bsp_pop_reg(Time); vecfreed(Time);
    bsp_end();

} /* end bspbench */