int main(int argc,char **argv) { #ifndef SINGLE_PREC double *A,*B,*p,*C; #else float *A,*B,*p,*C; #endif int i,j,k,x,y,z,nx,ny,nz,proc_id,nproc,dims[2],ndim,nu; int istart[3],isize[3],iend[3]; int fstart[3],fsize[3],fend[3]; int iproc,jproc,ng[3],kmax,iex,conf,m,n; long int Nglob,Ntot; double pi,twopi,sinyz; double *sinx,*siny,*sinz,factor; double rtime1,rtime2,gt[12],gt1[12],gt2[12],timers[12]; double tcomm,gtcomm[3]; double cdiff,ccdiff,ans,prec; FILE *fp; unsigned char op_f[]="fft", op_b[]="tff"; int memsize[3]; #ifndef SINGLE_PREC void print_all(double *,long int,int,long int),mult_array(double *,long int,double); #else void print_all(float *,long int,int,long int),mult_array(float *,long int,double); #endif MPI_Init(&argc,&argv); MPI_Comm_size(MPI_COMM_WORLD,&nproc); MPI_Comm_rank(MPI_COMM_WORLD,&proc_id); pi = atan(1.0)*4.0; twopi = 2.0*pi; for(i=0; i< 12; i++) { gt[i] = 0.0; gt1[i] = 0.0; gt2[i] = 1E10; } Cset_timers(); if(proc_id == 0) { if((fp=fopen("stdin", "r"))==NULL){ printf("Cannot open file. Setting to default nx=ny=nz=128, ndim=2, n=1.\n"); nx=ny=nz=128; n=1; } else { fscanf(fp,"%d %d %d %d %d\n",&nx,&ny,&nz,&ndim,&n); fclose(fp); } #ifndef SINGLE_PREC printf("Double precision\n (%d %d %d) grid\n %d proc. dimensions\n%d repetitions\n",nx,ny,nz,ndim,n); #else printf("Single precision\n (%d %d %d) grid\n %d proc. dimensions\n%d repetitions\n",nx,ny,nz,ndim,n); #endif } MPI_Bcast(&nx,1,MPI_INT,0,MPI_COMM_WORLD); MPI_Bcast(&ny,1,MPI_INT,0,MPI_COMM_WORLD); MPI_Bcast(&nz,1,MPI_INT,0,MPI_COMM_WORLD); MPI_Bcast(&n,1,MPI_INT,0,MPI_COMM_WORLD); MPI_Bcast(&ndim,1,MPI_INT,0,MPI_COMM_WORLD); if(ndim == 1) { dims[0] = 1; dims[1] = nproc; } else if(ndim == 2) { fp = fopen("dims","r"); if(fp != NULL) { if(proc_id == 0) printf("Reading proc. grid from file dims\n"); fscanf(fp,"%d %d\n",dims,dims+1); fclose(fp); if(dims[0]*dims[1] != nproc) dims[1] = nproc / dims[0]; } else { if(proc_id == 0) printf("Creating proc. grid with mpi_dims_create\n"); dims[0]=dims[1]=0; MPI_Dims_create(nproc,2,dims); if(dims[0] > dims[1]) { dims[0] = dims[1]; dims[1] = nproc/dims[0]; } } } if(proc_id == 0) printf("Using processor grid %d x %d\n",dims[0],dims[1]); /* Initialize P3DFFT */ Cp3dfft_setup(dims,nx,ny,nz,MPI_Comm_c2f(MPI_COMM_WORLD),nx,ny,nz,1,memsize); /* Get dimensions for input array - real numbers, X-pencil shape. Note that we are following the Fortran ordering, i.e. the dimension with stride-1 is X. */ /* printf("Calling get_dims 1\n"); */ conf = 1; Cp3dfft_get_dims(istart,iend,isize,conf); /* Get dimensions for output array - complex numbers, Z-pencil shape. Stride-1 dimension could be X or Z, depending on how the library was compiled (stride1 option) */ /* printf("Calling get_dims 2\n"); */ conf = 2; Cp3dfft_get_dims(fstart,fend,fsize,conf); /* printf("Allocating\n"); */ /* Allocate and Initialize */ #ifndef SINGLE_PREC A = (double *) malloc(sizeof(double) * isize[0]*isize[1]*isize[2]); B = (double *) malloc(sizeof(double) * fsize[0]*fsize[1]*fsize[2]*2); C = (double *) malloc(sizeof(double) * isize[0]*isize[1]*isize[2]); #else A = (float *) malloc(sizeof(float) * isize[0]*isize[1]*isize[2]); B = (float *) malloc(sizeof(float) * fsize[0]*fsize[1]*fsize[2]*2); C = (float *) malloc(sizeof(float) * isize[0]*isize[1]*isize[2]); #endif if(A == NULL) printf("%d: Error allocating array A (%d)\n",proc_id,isize[0]*isize[1]*isize[2]); if(B == NULL) printf("%d: Error allocating array B (%d)\n",proc_id,fsize[0]*fsize[1]*fsize[2]*2); if(C == NULL) printf("%d: Error allocating array C (%d)\n",proc_id,isize[0]*isize[1]*isize[2]); /* printf("Initializing\n"); */ sinx = malloc(sizeof(double)*nx); siny = malloc(sizeof(double)*ny); sinz = malloc(sizeof(double)*nz); for(z=0;z < isize[2];z++) sinz[z] = sin((z+istart[2]-1)*twopi/nz); for(y=0;y < isize[1];y++) siny[y] = sin((y+istart[1]-1)*twopi/ny); for(x=0;x < isize[0];x++) sinx[x] = sin((x+istart[0]-1)*twopi/nx); p = A; for(z=0;z < isize[2];z++) for(y=0;y < isize[1];y++) { sinyz = siny[y]*sinz[z]; for(x=0;x < isize[0];x++) *p++ = sinx[x]*sinyz; } Ntot = fsize[0]*fsize[1]; Ntot *= fsize[2]*2; Nglob = nx * ny; Nglob *= nz; factor = 1.0/Nglob; rtime1 = 0.0; for(m=0;m < n;m++) { if(proc_id == 0) printf("Iteration %d\n",m); MPI_Barrier(MPI_COMM_WORLD); rtime1 = rtime1 - MPI_Wtime(); /* compute forward Fourier transform on A, store results in B */ Cp3dfft_ftran_r2c(A,B,op_f); rtime1 = rtime1 + MPI_Wtime(); if(proc_id == 0) printf("Result of forward transform\n"); print_all(B,Ntot,proc_id,Nglob); /* normalize */ mult_array(B,Ntot,factor); /* Compute backward transform on B, store results in C */ MPI_Barrier(MPI_COMM_WORLD); rtime1 = rtime1 - MPI_Wtime(); Cp3dfft_btran_c2r(B,C,op_b); rtime1 = rtime1 + MPI_Wtime(); } /* free work space */ Cp3dfft_clean(); /* Check results */ cdiff = 0.0; p = C; for(z=0;z < isize[2];z++) for(y=0;y < isize[1];y++) { sinyz =siny[y]*sinz[z]; for(x=0;x < isize[0];x++) { ans = sinx[x]*sinyz; if(cdiff < fabs(*p - ans)) cdiff = fabs(*p - ans); p++; } } Cget_timers(timers); #ifndef SINGLE_PREC MPI_Reduce(&cdiff,&ccdiff,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); #else MPI_Reduce(&cdiff,&ccdiff,1,MPI_REAL,MPI_MAX,0,MPI_COMM_WORLD); #endif if(proc_id == 0) { #ifndef SINGLE_PREC prec = 1.0e-14; #else prec = 1.0e-5; #endif if(ccdiff > prec * Nglob*0.25) printf("Results are incorrect\n"); else printf("Results are correct\n"); printf("max diff =%g\n",ccdiff); } /* Gather timing statistics */ MPI_Reduce(&rtime1,&rtime2,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); for (i=0;i < 12;i++) { timers[i] = timers[i] / ((double) n); } MPI_Reduce(&timers,>,12,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); MPI_Reduce(&timers,>1,12,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); MPI_Reduce(&timers,>2,12,MPI_DOUBLE,MPI_MIN,0,MPI_COMM_WORLD); tcomm = (timers[1]+timers[2]+timers[3]+timers[4]); MPI_Reduce(&timers,>,12,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); for (i=0;i < 12;i++) { gt[i] = gt[i]/ ((double) nproc); } if(proc_id == 0) { printf("Time per loop=%lg\n",rtime2/((double) n)); for(i=0;i < 12;i++) { printf("timer[%d] (avg/max/min): %lE %lE %lE\n",i+1,gt[i],gt1[i],gt2[i]); } } MPI_Finalize(); }
void test_P3DFFT(int *n, std::ofstream& results, int decomp, int * dims){ int nx,ny,nz,procid,nprocs,ndim; int istart[3],isize[3],iend[3]; int fstart[3],fsize[3],fend[3]; int p3dfft_mem_conf,nrep; long int Nlocal,Nglob; double factor; double l_timers[12]={0},g_timers[12]={0}; double total_time=0*MPI_Wtime(), setup_time=0; // rtime_local is timings on each process and _global is the max reduced to root // 0 is the forward FFT time, 1 is the Hadamard multiplication, 2 is the IFFT time, 3 is the sum of 0-2, and 4 is the setup time // The communication time is measured by l_timers locally on each process and then reduced to g_timers to the root. // the sum of first four elements give the comm time unsigned char op_f[4]="fft", op_b[4]="tff"; int memsize[3]; MPI_Comm_size(MPI_COMM_WORLD,&nprocs); MPI_Comm_rank(MPI_COMM_WORLD,&procid); nx=n[0]; ny=n[1]; nz=n[2]; ndim=1; nrep=NREP; if(decomp==1){ dims[0] = 1; dims[1] = nprocs; } if(procid == 0) printf("Using processor grid %d x %d\n",dims[0],dims[1]); /* Initialize P3DFFT */ MPI_Barrier(MPI_COMM_WORLD); setup_time -= MPI_Wtime(); //Compute Setup Time. Cp3dfft_setup(dims,nx,ny,nz,MPI_Comm_c2f(MPI_COMM_WORLD),nx,ny,nz,1,memsize); setup_time += MPI_Wtime(); //Compute Setup Time. PCOUT<<"done with setup"<<std::endl; Cp3dfft_get_dims(istart,iend,isize,1); Cp3dfft_get_dims(fstart,fend,fsize,2); /* Allocate and initialize */ double *A; // Input matrix A A=(double*)fftw_malloc(sizeof(double)*(memsize[0]*memsize[1]*memsize[2]*2)); //B=(double*)fftw_malloc(sizeof(double)*(memsize[0]*memsize[1]*memsize[2]*2)); /* Warmup */ Cp3dfft_ftran_r2c(A,A,op_f); Cp3dfft_ftran_r2c(A,A,op_f); MPI_Barrier(MPI_COMM_WORLD); Cset_timers(); for (int rep=0; rep<nrep; rep++){ initialize_p3dfft(A,n); MPI_Barrier(MPI_COMM_WORLD); /* Forward transform */ total_time -= MPI_Wtime(); Cp3dfft_ftran_r2c(A,A,op_f); total_time += MPI_Wtime(); MPI_Barrier(MPI_COMM_WORLD); } Cget_timers(l_timers); Cp3dfft_btran_c2r(A,A,op_b); /* Compute Error */ //PCOUT<<"Done With FFTs computing error"<<std::endl; compute_error_p3dfft(A,n); /* Gather timing statistics */ double g_total_time, g_comm_time, g_setup_time; MPI_Reduce(&total_time,&g_total_time,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); MPI_Reduce(&setup_time,&g_setup_time,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); MPI_Reduce(&l_timers,&g_timers,12,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); g_total_time=g_total_time/nrep; g_comm_time=(g_timers[0]+g_timers[1]+g_timers[2]+g_timers[3])/((double) nrep); //g_total_time=g_total_time/((double)nrep); ptrdiff_t size=n[0];size*=n[1]; size*=n[2]; double gflops=2.5*size*( log2(n[2]) + log2(n[0])+ log2(n[1]) )/(g_total_time)/1e9; if(procid == 0){ std::cout.precision(4); std::cout<<"P3DFFT Size="<<n[0]<<" "<<n[1]<<" "<<n[2]<<std::endl;; std::cout<<"0= "<<g_timers[0]<<" 1= "<<g_timers[1]<<" 2= "<<g_timers[2]<<" 3= "<<g_timers[3]<<" 4= "<<g_timers[4]<<std::endl; std::cout<<"5= "<<g_timers[5]<<" 6= "<<g_timers[6]<<" 7= "<<g_timers[7]<<" 8= "<<g_timers[8]<<" 9= "<<g_timers[9]<<std::endl; std::cout<<"10= "<<g_timers[10]<<" 11= "<<g_timers[11]<<std::endl; std::cout<<"\033[1;31m"; std::cout<<"\t"<<"np"<<"\t"<<"Grid"<<"\t"<<"Total"<<'\t'<<"Comm Time"<<"\t"<<"Setup Time"<<"\t"<<"\t"<<"Reps"<<'\t'<<"GFlops"<<std::endl; std::cout<<"\t"<<nprocs<<"\t"<<dims[1]<<"*"<<dims[0]<<"\t"<<g_total_time<<'\t'<<g_comm_time<<"\t"<<g_setup_time<<"\t"<<nrep<<'\t'<<gflops<<std::endl; std::cout<<"\033[0m\n"<<std::endl; results<<"\t"<<nprocs<<"\t"<<dims[1]<<"*"<<dims[0]<<"\t"<<g_total_time<<'\t'<<g_comm_time<<"\t"<<g_setup_time<<"\t"<<nrep<<'\t'<<gflops<<std::endl; } /* Free work space */ fftw_free(A); Cp3dfft_clean(); }