Example #1
0
/*
 * Round-trip test driver for P3DFFT.
 *
 * Reads the grid size (nx ny nz), the processor-grid dimensionality (ndim)
 * and the repetition count (n) from a file literally named "stdin" on rank 0,
 * broadcasts them, sets up P3DFFT, fills the input array with
 * sin(x)*sin(y)*sin(z), and then n times performs a forward real-to-complex
 * transform, normalises by 1/(nx*ny*nz) and transforms back.  The result of
 * the last backward transform is compared against the analytic input and
 * timing statistics are printed on rank 0.
 *
 * Returns 0 on a completed run and 1 when the run had to be abandoned
 * (invalid ndim or a failed allocation on any rank).
 */
int main(int argc,char **argv)
{
#ifndef SINGLE_PREC
   double *A,*B,*p,*C;
#else
   float *A,*B,*p,*C;
#endif
   int i,x,y,z,nx,ny,nz,proc_id,nproc,dims[2],ndim;
   int istart[3],isize[3],iend[3];
   int fstart[3],fsize[3],fend[3];
   int conf,m,n,nread;
   int have_grid,alloc_failed,any_failed;
   long int Nglob,Ntot,Nin;
   double pi,twopi,sinyz;
   double *sinx,*siny,*sinz,factor;
   double rtime1,rtime2,gt[12],gt1[12],gt2[12],timers[12];
   double cdiff,ccdiff,ans,prec;
   FILE *fp;
   unsigned char op_f[]="fft", op_b[]="tff";
   int memsize[3];

   /* Helpers defined elsewhere in the project. */
#ifndef SINGLE_PREC
   void print_all(double *,long int,int,long int),mult_array(double *,long int,double);
#else
   void print_all(float *,long int,int,long int),mult_array(float *,long int,double);
#endif

   MPI_Init(&argc,&argv);
   MPI_Comm_size(MPI_COMM_WORLD,&nproc);
   MPI_Comm_rank(MPI_COMM_WORLD,&proc_id);

   pi = atan(1.0)*4.0;
   twopi = 2.0*pi;

   /* gt: sum -> average, gt1: max, gt2: min of the per-rank timers */
   for(i=0; i< 12; i++) {
     gt[i] = 0.0;
     gt1[i] = 0.0;
     gt2[i] = 1E10;
   }

   Cset_timers();

   /* Rank 0 reads the run parameters; every other rank gets them by broadcast. */
   if(proc_id == 0) {
     if((fp=fopen("stdin", "r"))==NULL){
        printf("Cannot open file. Setting to default nx=ny=nz=128, ndim=2, n=1.\n");
        /* ndim must be set here too, otherwise an indeterminate value is broadcast */
        nx=ny=nz=128; ndim=2; n=1;
     } else {
        nread = fscanf(fp,"%d %d %d %d %d\n",&nx,&ny,&nz,&ndim,&n);
        fclose(fp);
        /* A short or nonsensical read would leave some values indeterminate
           or make the per-iteration averages divide by zero. */
        if(nread != 5 || nx < 1 || ny < 1 || nz < 1 || n < 1) {
           printf("Cannot parse file. Setting to default nx=ny=nz=128, ndim=2, n=1.\n");
           nx=ny=nz=128; ndim=2; n=1;
        }
     }
#ifndef SINGLE_PREC
     printf("Double precision\n (%d %d %d) grid\n %d proc. dimensions\n%d repetitions\n",nx,ny,nz,ndim,n);
#else
     printf("Single precision\n (%d %d %d) grid\n %d proc. dimensions\n%d repetitions\n",nx,ny,nz,ndim,n);
#endif
   }
   MPI_Bcast(&nx,1,MPI_INT,0,MPI_COMM_WORLD);
   MPI_Bcast(&ny,1,MPI_INT,0,MPI_COMM_WORLD);
   MPI_Bcast(&nz,1,MPI_INT,0,MPI_COMM_WORLD);
   MPI_Bcast(&n,1,MPI_INT,0,MPI_COMM_WORLD);
   MPI_Bcast(&ndim,1,MPI_INT,0,MPI_COMM_WORLD);

   /* Choose the processor grid. */
   if(ndim == 1) {
     dims[0] = 1; dims[1] = nproc;
   }
   else if(ndim == 2) {
     have_grid = 0;
     fp = fopen("dims","r");
     if(fp != NULL) {
       if(proc_id == 0)
         printf("Reading proc. grid from file dims\n");
       /* Only accept the file when both numbers parse and dims[0] can be
          used as a divisor; otherwise fall through to MPI_Dims_create. */
       if(fscanf(fp,"%d %d\n",dims,dims+1) == 2 && dims[0] > 0) {
         have_grid = 1;
         if(dims[0]*dims[1] != nproc)
            dims[1] = nproc / dims[0];
       }
       fclose(fp);
     }
     if(!have_grid) {
       if(proc_id == 0)
          printf("Creating proc. grid with mpi_dims_create\n");
       dims[0]=dims[1]=0;
       MPI_Dims_create(nproc,2,dims);
       /* Keep the smaller factor in dims[0]. */
       if(dims[0] > dims[1]) {
          dims[0] = dims[1];
          dims[1] = nproc/dims[0];
       }
     }
   }
   else {
     /* ndim was broadcast, so every rank takes this branch together. */
     if(proc_id == 0)
        printf("Invalid ndim=%d (must be 1 or 2)\n",ndim);
     MPI_Finalize();
     return 1;
   }

   if(proc_id == 0)
      printf("Using processor grid %d x %d\n",dims[0],dims[1]);

   /* Initialize P3DFFT */
   Cp3dfft_setup(dims,nx,ny,nz,MPI_Comm_c2f(MPI_COMM_WORLD),nx,ny,nz,1,memsize);
   /* Get dimensions for input array - real numbers, X-pencil shape.
      Note that we are following the Fortran ordering, i.e.
      the dimension  with stride-1 is X. */
   conf = 1;
   Cp3dfft_get_dims(istart,iend,isize,conf);
   /* Get dimensions for output array - complex numbers, Z-pencil shape.
      Stride-1 dimension could be X or Z, depending on how the library
      was compiled (stride1 option) */
   conf = 2;
   Cp3dfft_get_dims(fstart,fend,fsize,conf);

   /* Element counts, widened before multiplying so large local blocks
      do not overflow int.  B holds complex values, hence the factor 2. */
   Nin  = (long int) isize[0]*isize[1]*isize[2];
   Ntot = (long int) fsize[0]*fsize[1]*fsize[2]*2;
   Nglob = (long int) nx*ny*nz;
   factor = 1.0/Nglob;

   /* Allocate and Initialize */
#ifndef SINGLE_PREC
   A = (double *) malloc(sizeof(double) * Nin);
   B = (double *) malloc(sizeof(double) * Ntot);
   C = (double *) malloc(sizeof(double) * Nin);
#else
   A = (float *) malloc(sizeof(float) * Nin);
   B = (float *) malloc(sizeof(float) * Ntot);
   C = (float *) malloc(sizeof(float) * Nin);
#endif
   sinx = (double *) malloc(sizeof(double)*nx);
   siny = (double *) malloc(sizeof(double)*ny);
   sinz = (double *) malloc(sizeof(double)*nz);

   alloc_failed = 0;
   if(A == NULL) {
     printf("%d: Error allocating array A (%ld)\n",proc_id,Nin);
     alloc_failed = 1;
   }
   if(B == NULL) {
     printf("%d: Error allocating array B (%ld)\n",proc_id,Ntot);
     alloc_failed = 1;
   }
   if(C == NULL) {
     printf("%d: Error allocating array C (%ld)\n",proc_id,Nin);
     alloc_failed = 1;
   }
   if(sinx == NULL || siny == NULL || sinz == NULL) {
     printf("%d: Error allocating sine tables\n",proc_id);
     alloc_failed = 1;
   }

   /* Agree across all ranks whether anyone failed, so that the whole job
      shuts down together instead of one rank dereferencing NULL while the
      others block in a collective call. */
   any_failed = 0;
   MPI_Reduce(&alloc_failed,&any_failed,1,MPI_INT,MPI_MAX,0,MPI_COMM_WORLD);
   MPI_Bcast(&any_failed,1,MPI_INT,0,MPI_COMM_WORLD);
   if(any_failed) {
     free(sinx); free(siny); free(sinz);
     free(A); free(B); free(C);
     Cp3dfft_clean();
     MPI_Finalize();
     return 1;
   }

   /* One-dimensional sine tables over the local index ranges; istart is
      shifted by one, i.e. treated as 1-based. */
   for(z=0;z < isize[2];z++)
     sinz[z] = sin((z+istart[2]-1)*twopi/nz);
   for(y=0;y < isize[1];y++)
     siny[y] = sin((y+istart[1]-1)*twopi/ny);
   for(x=0;x < isize[0];x++)
     sinx[x] = sin((x+istart[0]-1)*twopi/nx);

   /* A(x,y,z) = sin(x)*sin(y)*sin(z), x fastest */
   p = A;
   for(z=0;z < isize[2];z++)
     for(y=0;y < isize[1];y++) {
       sinyz = siny[y]*sinz[z];
       for(x=0;x < isize[0];x++)
          *p++ = sinx[x]*sinyz;
     }

   /* rtime1 accumulates only the time spent inside the two transforms. */
   rtime1 = 0.0;
   for(m=0;m < n;m++) {

     if(proc_id == 0)
        printf("Iteration %d\n",m);
     MPI_Barrier(MPI_COMM_WORLD);
     rtime1 = rtime1 - MPI_Wtime();
     /* compute forward Fourier transform on A, store results in B */
     Cp3dfft_ftran_r2c(A,B,op_f);
     rtime1 = rtime1 + MPI_Wtime();

     if(proc_id == 0)
        printf("Result of forward transform\n");

     print_all(B,Ntot,proc_id,Nglob);
     /* normalize */
     mult_array(B,Ntot,factor);

     /* Compute backward transform on B, store results in C */
     MPI_Barrier(MPI_COMM_WORLD);
     rtime1 = rtime1 - MPI_Wtime();
     Cp3dfft_btran_c2r(B,C,op_b);
     rtime1 = rtime1 + MPI_Wtime();

   }
   /* free work space */
   Cp3dfft_clean();

   /* Check results: largest absolute deviation of C from the analytic input */
   cdiff = 0.0; p = C;
   for(z=0;z < isize[2];z++)
     for(y=0;y < isize[1];y++)  {
        sinyz =siny[y]*sinz[z];
        for(x=0;x < isize[0];x++) {
           ans = sinx[x]*sinyz;
           if(cdiff < fabs(*p - ans))
              cdiff = fabs(*p - ans);
           p++;
        }
     }

   Cget_timers(timers);

   /* cdiff/ccdiff are double in both precision builds, so the reduction
      datatype must be MPI_DOUBLE regardless of SINGLE_PREC. */
   ccdiff = 0.0;
   MPI_Reduce(&cdiff,&ccdiff,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);

   if(proc_id == 0) {
#ifndef SINGLE_PREC
     prec = 1.0e-14;
#else
     prec = 1.0e-5;
#endif
     if(ccdiff > prec * Nglob*0.25)
       printf("Results are incorrect\n");
     else
       printf("Results are correct\n");

     printf("max diff =%g\n",ccdiff);
   }

   /* Gather timing statistics */
   rtime2 = 0.0;
   MPI_Reduce(&rtime1,&rtime2,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);

   /* per-iteration timers */
   for (i=0;i < 12;i++) {
     timers[i] = timers[i] / ((double) n);
   }

   MPI_Reduce(timers,gt,12,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
   MPI_Reduce(timers,gt1,12,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);
   MPI_Reduce(timers,gt2,12,MPI_DOUBLE,MPI_MIN,0,MPI_COMM_WORLD);

   /* The reduced values exist only on the root, so average and print there. */
   if(proc_id == 0) {
      for (i=0;i < 12;i++) {
        gt[i] = gt[i]/ ((double) nproc);
      }
      printf("Time per loop=%lg\n",rtime2/((double) n));
      for(i=0;i < 12;i++) {
        printf("timer[%d] (avg/max/min): %lE %lE %lE\n",i+1,gt[i],gt1[i],gt2[i]);
      }
   }

   free(sinx); free(siny); free(sinz);
   free(A); free(B); free(C);

   MPI_Finalize();

   return 0;
}
Example #2
0
/*
 * Times the P3DFFT forward real-to-complex transform on an
 * n[0] x n[1] x n[2] grid and reports the result.
 *
 *   n       - global grid size, three ints
 *   results - stream that receives one tab-separated summary line (rank 0 only)
 *   decomp  - when 1, dims is overwritten with a 1 x nprocs grid;
 *             otherwise dims is used as supplied by the caller
 *   dims    - processor grid, two ints (read, and written when decomp == 1)
 *
 * The forward transform is run NREP times in place on one buffer; only the
 * transform calls themselves are accumulated into the reported total time.
 * Setup time is measured separately.  One backward transform is run after
 * the timed loop and its result is handed to compute_error_p3dfft.
 */
void test_P3DFFT(int *n, std::ofstream& results, int decomp, int * dims){

  int nx,ny,nz,procid,nprocs,nrep;
  int istart[3],isize[3],iend[3];
  int fstart[3],fsize[3],fend[3];
  // l_timers holds this rank's P3DFFT timers; g_timers is their per-entry
  // maximum on the root.  The first four entries are summed below and
  // reported as communication time.
  double l_timers[12]={0},g_timers[12]={0};
  double total_time=0, setup_time=0;
  unsigned char op_f[4]="fft", op_b[4]="tff";
  int memsize[3];

  MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
  MPI_Comm_rank(MPI_COMM_WORLD,&procid);

  nx=n[0]; ny=n[1]; nz=n[2]; nrep=NREP;

  if(decomp==1){
    dims[0] = 1; dims[1] = nprocs;
  }
  if(procid == 0)
    printf("Using processor grid %d x %d\n",dims[0],dims[1]);

  /* Initialize P3DFFT */
  MPI_Barrier(MPI_COMM_WORLD);
  setup_time -= MPI_Wtime(); //Compute Setup Time.
  Cp3dfft_setup(dims,nx,ny,nz,MPI_Comm_c2f(MPI_COMM_WORLD),nx,ny,nz,1,memsize);
  setup_time +=  MPI_Wtime(); //Compute Setup Time.
  PCOUT<<"done with setup"<<std::endl;

  Cp3dfft_get_dims(istart,iend,isize,1);
  Cp3dfft_get_dims(fstart,fend,fsize,2);

  /* Allocate and initialize */
  // Widen before multiplying: the product of three ints (times 2) can
  // exceed INT_MAX for large local blocks.
  const size_t n_alloc = static_cast<size_t>(memsize[0])*memsize[1]*memsize[2]*2;
  double *A; // Single buffer, transformed in place
  A=(double*)fftw_malloc(sizeof(double)*n_alloc);
  // NOTE(review): A is not checked for NULL; a failed allocation will crash
  // in the first transform below.  Confirm how the project wants to abort.

  // Give the warm-up transforms defined input instead of reading
  // indeterminate memory.
  for (size_t idx=0; idx<n_alloc; ++idx)
    A[idx]=0.0;

  /* Warmup */
  Cp3dfft_ftran_r2c(A,A,op_f);
  Cp3dfft_ftran_r2c(A,A,op_f);
  MPI_Barrier(MPI_COMM_WORLD);
  Cset_timers();

  for (int rep=0; rep<nrep; rep++){
    initialize_p3dfft(A,n);

    MPI_Barrier(MPI_COMM_WORLD);

    /* Forward transform */
    total_time -=  MPI_Wtime();
    Cp3dfft_ftran_r2c(A,A,op_f);
    total_time +=  MPI_Wtime();

    MPI_Barrier(MPI_COMM_WORLD);
  }

  Cget_timers(l_timers);
  Cp3dfft_btran_c2r(A,A,op_b);

  /* Compute Error */
  compute_error_p3dfft(A,n);

  /* Gather timing statistics */
  // Zero-initialised because MPI_Reduce only writes the receive buffer on
  // the root rank.
  double g_total_time=0, g_setup_time=0;

  MPI_Reduce(&total_time,&g_total_time,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);
  MPI_Reduce(&setup_time,&g_setup_time,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);
  MPI_Reduce(l_timers,g_timers,12,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);

  if(procid == 0){
    // Derived figures are computed only where the reduced values are valid.
    g_total_time=g_total_time/nrep;
    const double g_comm_time=(g_timers[0]+g_timers[1]+g_timers[2]+g_timers[3])/((double) nrep);
    ptrdiff_t size=n[0];size*=n[1]; size*=n[2];
    const double gflops=2.5*size*( log2(n[2]) + log2(n[0])+ log2(n[1]) )/(g_total_time)/1e9;

    std::cout.precision(4);
    std::cout<<"P3DFFT Size="<<n[0]<<" "<<n[1]<<" "<<n[2]<<std::endl;
    std::cout<<"0= "<<g_timers[0]<<" 1= "<<g_timers[1]<<" 2= "<<g_timers[2]<<" 3= "<<g_timers[3]<<" 4= "<<g_timers[4]<<std::endl;
    std::cout<<"5= "<<g_timers[5]<<" 6= "<<g_timers[6]<<" 7= "<<g_timers[7]<<" 8= "<<g_timers[8]<<" 9= "<<g_timers[9]<<std::endl;
    std::cout<<"10= "<<g_timers[10]<<" 11= "<<g_timers[11]<<std::endl;
    std::cout<<"\033[1;31m";
    std::cout<<"\t"<<"np"<<"\t"<<"Grid"<<"\t"<<"Total"<<'\t'<<"Comm Time"<<"\t"<<"Setup Time"<<"\t"<<"\t"<<"Reps"<<'\t'<<"GFlops"<<std::endl;
    std::cout<<"\t"<<nprocs<<"\t"<<dims[1]<<"*"<<dims[0]<<"\t"<<g_total_time<<'\t'<<g_comm_time<<"\t"<<g_setup_time<<"\t"<<nrep<<'\t'<<gflops<<std::endl;
    std::cout<<"\033[0m\n"<<std::endl;

    results<<"\t"<<nprocs<<"\t"<<dims[1]<<"*"<<dims[0]<<"\t"<<g_total_time<<'\t'<<g_comm_time<<"\t"<<g_setup_time<<"\t"<<nrep<<'\t'<<gflops<<std::endl;
  }
  /* Free work space */
  fftw_free(A);
  Cp3dfft_clean();

}