/*
 * Allocate room for `count` doubles, aborting the whole MPI job on failure.
 * `what` names the buffer in the error message.
 */
static double *nufft_alloc_doubles(size_t count, const char *what)
{
    /* Ask for at least one element so that a NULL return always means failure. */
    double *buf = (double *) malloc(sizeof(double) * (count > 0 ? count : 1));
    if (buf == NULL) {
        fprintf(stderr, "out of memory allocating %s (%lu doubles)\n",
                what, (unsigned long) count);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    return buf;
}

/*
 * Gridding driver: spreads a set of sources onto the oversampled grid with a
 * Gaussian kernel, exchanges the halo cells with the 8 neighbouring ranks of
 * the P1 x P2 processor grid, runs the forward real-to-complex P3DFFT on the
 * result and prints the average gridding / communication / FFT / total time.
 *
 * Command line: M R Msp P1 P2   (P1*P2 must equal the number of MPI ranks).
 * Step 3 (deconvolution) is not implemented here.
 */
int main(int argc, char *argv[])
{
    /***************************************************************************
     * Variables for p3dfft
     * nproc      : number of processors
     * proc_id    : rank of a processor
     * conf       : option of p3dfft_get_dims
     * pk, pj     : index of the processor in the processor grid
     * P1, P2     : dimension of the processor grid
     * istart, isize, iend, fstart, fsize, fend : see p3dfft_get_dims
     * NORTH, NE, EAST, SE, SOUTH, SW, WEST, NW : rank of neighbouring processor
     * dimSbuffer : dimension of send buffer
     * dimRbuffer : dimension of recv buffer
     * op_f       : option for carrying out fft
     **************************************************************************/
    int nproc, proc_id, conf, pk, pj, P1, P2;
    int dims[2], memsize[3];
    int istart[3], isize[3], iend[3];
    int fstart[3], fsize[3], fend[3];
    int NORTH, NE, EAST, SE, SOUTH, SW, WEST, NW;
    double *N_Recv, *NE_Recv, *E_Recv, *SE_Recv, *S_Recv, *SW_Recv, *W_Recv, *NW_Recv;
    double *N_Send, *NE_Send, *E_Send, *SE_Send, *S_Send, *SW_Send, *W_Send, *NW_Send;
    int dimSbuffer[3], dimRbuffer[3];
    /* Sized by the initializer so the terminating NUL fits ("fft" needs 4 bytes). */
    unsigned char op_f[] = "fft";

    /***************************************************************************
     * M    : resolution of the Fourier modes, -M/2 ... M/2-1
     * R    : over sampling ratio
     * Mr   : oversampled grid resolution, resolution for FFT
     * Msp  : spreading radius, Msp = 12 for double precision, Msp = 6 single
     * tau  : shape of the gaussian
     * L    : domain size 2*pi
     * h    : physical grid resolution
     * n_src: # of sources in a processor
     * lnx, lny, lnz: dimension of array in each processor (physical)
     * lkx, lky, lkz: dimension of array in each processor (Fourier)
     * mx, my, mz   : index of nearest grid (in the global sense)
     * smx, smy, smz: index of the nearest grid (w.r.t the spreading rectangle)
     * xj, yj, zj   : locations of sources
     * spread_rect  : stores data with halo, lnx x (lny+2*Msp) x (lnz+2*Msp)
     * local_rect   : stores data after spreading, lnx x lny x lnz
     * output_rect  : stores output data, 2 doubles per complex output element
     **************************************************************************/
    int M, Mr, R, Msp, n_src;
    int lnx, lny, lnz, lkx, lky, lkz;
    int mx, my, mz, smx, smy, smz;
    double *xj, *yj, *zj;
    double L, h, tau;
    double diffx, diffy, diffz, E1, E2x, E2y, E2z;
    double V0, V1, V2, V3;
    double mxh, myh, mzh, piMtau;
    double *E2xl, *E2yl, *E2zl, *E3, *E4;
    double *spread_rect, *local_rect, *output_rect;
    int idx[3], dimSpreadRect[3];
    int i, j, k, s, l1;
    size_t c, spread_len, local_len, output_len;
    int ns_count, ew_count, corner_count;
    double t1, t2, t3, fft_localt, comm_localt, fft_globalt, comm_globalt;
    double grid_localt, grid_globalt, totalt;
    MPI_Status status;

    /* One tag per travel direction so halo messages can never be mismatched,
     * even when several neighbours are the same rank (small processor grids). */
    enum {
        TAG_TO_N = 101, TAG_TO_S, TAG_TO_E, TAG_TO_W,
        TAG_TO_NE, TAG_TO_SW, TAG_TO_NW, TAG_TO_SE
    };

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
    MPI_Comm_rank(MPI_COMM_WORLD, &proc_id);

    if (argc != 6) {
        fprintf(stderr, "must inmput M, R, Msp, P1, P2\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
        return 1;
    }

    M   = atoi(argv[1]);
    R   = atoi(argv[2]);
    Msp = atoi(argv[3]);
    P1  = atoi(argv[4]);
    P2  = atoi(argv[5]);

    /* The neighbour ranks below are computed as row*P1 + column, which is only
     * a valid rank when the P1 x P2 grid covers exactly nproc ranks. */
    if (M <= 0 || R <= 0 || Msp <= 0 || P1 <= 0 || P2 <= 0 || P1 * P2 != nproc) {
        if (proc_id == 0) {
            fprintf(stderr,
                    "invalid arguments: M, R, Msp, P1, P2 must be positive and "
                    "P1*P2 (%d) must equal the number of ranks (%d)\n",
                    P1 * P2, nproc);
        }
        MPI_Abort(MPI_COMM_WORLD, 1);
        return 1;
    }

    t2 = MPI_Wtime();

    /* set 8 neighbours (periodic in both directions of the processor grid) */
    pk = proc_id / P1;
    pj = proc_id % P1;
    NORTH = mod(pk+1,P2)*P1 + mod(pj  ,P1);
    NE    = mod(pk+1,P2)*P1 + mod(pj+1,P1);
    EAST  = mod(pk  ,P2)*P1 + mod(pj+1,P1);
    SE    = mod(pk-1,P2)*P1 + mod(pj+1,P1);
    SOUTH = mod(pk-1,P2)*P1 + mod(pj  ,P1);
    SW    = mod(pk-1,P2)*P1 + mod(pj-1,P1);
    WEST  = mod(pk  ,P2)*P1 + mod(pj-1,P1);
    NW    = mod(pk+1,P2)*P1 + mod(pj-1,P1);

    L   = 2.0 * M_PI;
    Mr  = M*R;
    /* M*M is evaluated in double so it cannot overflow int for large M */
    tau = (1.*Msp) / ((double) M * M);
    h   = L / Mr;

    /* precompute E3 (used by the spreading loop) and E4 (only needed by the
     * deconvolution step, which is not implemented in this function) */
    E3 = nufft_alloc_doubles((size_t) Msp + 1, "E3");
    E4 = nufft_alloc_doubles((size_t) (M/2) + 1, "E4");
    for (i = 0; i <= Msp; ++i) {
        E3[i] = exp(-(M_PI*i/Mr)*(M_PI*i/Mr)/tau);
    }
    for (i = 0; i <= M/2; ++i) {
        E4[i] = exp(tau*i*i);
    }

    /* per-source kernel factors for offsets -Msp+1 ... Msp (2*Msp entries) */
    E2xl = nufft_alloc_doubles((size_t) 2*Msp, "E2xl");
    E2yl = nufft_alloc_doubles((size_t) 2*Msp, "E2yl");
    E2zl = nufft_alloc_doubles((size_t) 2*Msp, "E2zl");

    /* initialize P3DFFT */
    dims[0] = P1;
    dims[1] = P2;
    Cp3dfft_setup(dims, Mr, Mr, Mr, MPI_Comm_c2f(MPI_COMM_WORLD), Mr, Mr, Mr, 0, memsize);

    /* set input dimensions */
    conf = 1;
    Cp3dfft_get_dims(istart, iend, isize, conf);
    /* set output dimensions */
    conf = 2;
    Cp3dfft_get_dims(fstart, fend, fsize, conf);

    /* dimension of local rectangle */
    lnx = isize[0]; lny = isize[1]; lnz = isize[2];
    lkx = fsize[0]; lky = fsize[1]; lkz = fsize[2];

    /* The received halos are Msp cells wide and are added into local_rect at
     * offsets lny-Msp / lnz-Msp, so the local pencil must be at least Msp wide. */
    if (lny < Msp || lnz < Msp) {
        fprintf(stderr,
                "proc %d: local grid %d x %d x %d is narrower than the spreading radius %d\n",
                proc_id, lnx, lny, lnz, Msp);
        MPI_Abort(MPI_COMM_WORLD, 1);
        return 1;
    }

    n_src = M * (M/P1) * (M/P2);

    /* allocate memory for sources */
    xj = nufft_alloc_doubles((size_t) n_src, "xj");
    yj = nufft_alloc_doubles((size_t) n_src, "yj");
    zj = nufft_alloc_doubles((size_t) n_src, "zj");

    /* generate sources: a regular M-point-per-2*pi lattice, offset by this
     * rank's first grid index (istart is 1-based) divided by R */
    for (k = 0; k < M/P2; k++) {
        for (j = 0; j < M/P1; j++) {
            for (i = 0; i < M; i++) {
                xj[l(i,j,k,M,M/P1)] = i*(2*M_PI/M);
                yj[l(i,j,k,M,M/P1)] = (j+(istart[1]-1)/R)*(2*M_PI/M);
                zj[l(i,j,k,M,M/P1)] = (k+(istart[2]-1)/R)*(2*M_PI/M);
            }
        }
    }

    /* element counts are computed in size_t so large grids do not overflow int */
    spread_len = (size_t) lnx * (size_t) (lny + 2*Msp) * (size_t) (lnz + 2*Msp);
    local_len  = (size_t) lnx * (size_t) lny * (size_t) lnz;
    output_len = (size_t) lkx * (size_t) lky * (size_t) lkz * 2;

    /* rectangle for spreading, dimension: nx x (ny_local + 2Msp) x (nz_local+2Msp) */
    spread_rect = nufft_alloc_doubles(spread_len, "spread_rect");
    for (c = 0; c < spread_len; ++c)
        spread_rect[c] = 0.;
    dimSpreadRect[0] = lnx;
    dimSpreadRect[1] = lny + 2*Msp;
    dimSpreadRect[2] = lnz + 2*Msp;

    /* rectangle for local data, dimension: nx x ny_local x nz_local */
    local_rect = nufft_alloc_doubles(local_len, "local_rect");
    for (c = 0; c < local_len; ++c)
        local_rect[c] = 0.;

    /* output data */
    output_rect = nufft_alloc_doubles(output_len, "output_rect");
    for (c = 0; c < output_len; ++c)
        output_rect[c] = 0.;

    /* halo buffers: N/S faces, E/W faces and the four corners */
    ns_count     = lnx*lny*Msp;
    ew_count     = lnx*Msp*lnz;
    corner_count = lnx*Msp*Msp;

    N_Recv  = nufft_alloc_doubles((size_t) ns_count, "N_Recv");
    N_Send  = nufft_alloc_doubles((size_t) ns_count, "N_Send");
    S_Recv  = nufft_alloc_doubles((size_t) ns_count, "S_Recv");
    S_Send  = nufft_alloc_doubles((size_t) ns_count, "S_Send");
    W_Recv  = nufft_alloc_doubles((size_t) ew_count, "W_Recv");
    W_Send  = nufft_alloc_doubles((size_t) ew_count, "W_Send");
    E_Recv  = nufft_alloc_doubles((size_t) ew_count, "E_Recv");
    E_Send  = nufft_alloc_doubles((size_t) ew_count, "E_Send");
    NW_Recv = nufft_alloc_doubles((size_t) corner_count, "NW_Recv");
    NW_Send = nufft_alloc_doubles((size_t) corner_count, "NW_Send");
    SW_Recv = nufft_alloc_doubles((size_t) corner_count, "SW_Recv");
    SW_Send = nufft_alloc_doubles((size_t) corner_count, "SW_Send");
    NE_Recv = nufft_alloc_doubles((size_t) corner_count, "NE_Recv");
    NE_Send = nufft_alloc_doubles((size_t) corner_count, "NE_Send");
    SE_Recv = nufft_alloc_doubles((size_t) corner_count, "SE_Recv");
    SE_Send = nufft_alloc_doubles((size_t) corner_count, "SE_Send");

    /* step 1: gridding -- spread every source onto spread_rect */
    t3 = MPI_Wtime();
    piMtau = M_PI / (Mr * tau);

    for (s = 0; s < n_src; ++s) {
        /* closest grid point (in the whole domain) */
        mx = (int) round( xj[s]/h );
        my = (int) round( yj[s]/h );
        mz = (int) round( zj[s]/h );

        mxh = mx*h;
        myh = my*h;
        mzh = mz*h;

        /* closest grid point (in spreading rect with halo cells) */
        smx = mx - (istart[0]-1);
        smy = my - (istart[1]-1) + Msp;
        smz = mz - (istart[2]-1) + Msp;

        diffx = xj[s] - mxh;
        diffy = yj[s] - myh;
        diffz = zj[s] - mzh;

        E1 = exp( -(diffx*diffx+diffy*diffy+diffz*diffz)/(4*tau) );

        E2x = exp( piMtau * diffx );
        E2y = exp( piMtau * diffy );
        E2z = exp( piMtau * diffz );

        /* E2?l[o + Msp-1] = E2?^o for o = -Msp+1 ... Msp, built by repeated
         * multiplication / division starting from o = 0 */
        E2xl[Msp-1] = 1.;
        E2yl[Msp-1] = 1.;
        E2zl[Msp-1] = 1.;
        for (l1 = 1; l1 <= Msp; ++l1) {
            E2xl[l1+(Msp-1)] = E2xl[(l1-1)+(Msp-1)] * E2x;
            E2yl[l1+(Msp-1)] = E2yl[(l1-1)+(Msp-1)] * E2y;
            E2zl[l1+(Msp-1)] = E2zl[(l1-1)+(Msp-1)] * E2z;
        }
        for (l1 = -1; l1 >= -Msp+1; --l1) {
            E2xl[l1+(Msp-1)] = E2xl[(l1+1)+(Msp-1)] / E2x;
            E2yl[l1+(Msp-1)] = E2yl[(l1+1)+(Msp-1)] / E2y;
            E2zl[l1+(Msp-1)] = E2zl[(l1+1)+(Msp-1)] / E2z;
        }

        /* build the spreading rectangle */
        V0 = 1. * E1;
        for (k = -Msp+1; k <= Msp; ++k) {
            V1 = V0 * E2zl[k+(Msp-1)] * E3[abs(k)];
            for (j = -Msp+1; j <= Msp; ++j) {
                V2 = V1 * E2yl[j+(Msp-1)] * E3[abs(j)];
                for (i = -Msp+1; i <= Msp; ++i) {
                    V3 = V2 * E2xl[i+(Msp-1)] * E3[abs(i)];
                    /* x is wrapped periodically with mod(); y and z fall into
                     * the halo cells instead */
                    spread_rect[l(mod(i+smx,Mr),j+smy,k+smz,lnx,lny+2*Msp)] += V3;
                }
            }
        }
    } /* end of looping sources */

    grid_localt = MPI_Wtime() - t3;

    /* copy the interior of the spreading rectangle to the local rectangle */
    idx[0] = 0; idx[1] = Msp; idx[2] = Msp;
    setSbuffer(local_rect, spread_rect, idx, isize, dimSpreadRect);

    /* North / South send buffers */
    dimSbuffer[0] = lnx; dimSbuffer[1] = lny; dimSbuffer[2] = Msp;
    idx[0] = 0; idx[1] = Msp; idx[2] = lnz+Msp;
    setSbuffer(N_Send, spread_rect, idx, dimSbuffer, dimSpreadRect);
    idx[0] = 0; idx[1] = Msp; idx[2] = 0;
    setSbuffer(S_Send, spread_rect, idx, dimSbuffer, dimSpreadRect);

    /* West / East send buffers */
    dimSbuffer[0] = lnx; dimSbuffer[1] = Msp; dimSbuffer[2] = lnz;
    idx[0] = 0; idx[1] = 0; idx[2] = Msp;
    setSbuffer(W_Send, spread_rect, idx, dimSbuffer, dimSpreadRect);
    idx[0] = 0; idx[1] = lny+Msp; idx[2] = Msp;
    setSbuffer(E_Send, spread_rect, idx, dimSbuffer, dimSpreadRect);

    /* corner send buffers */
    dimSbuffer[0] = lnx; dimSbuffer[1] = Msp; dimSbuffer[2] = Msp;
    idx[0] = 0; idx[1] = lny+Msp; idx[2] = lnz+Msp;
    setSbuffer(NE_Send, spread_rect, idx, dimSbuffer, dimSpreadRect);
    idx[0] = 0; idx[1] = lny+Msp; idx[2] = 0;
    setSbuffer(SE_Send, spread_rect, idx, dimSbuffer, dimSpreadRect);
    idx[0] = 0; idx[1] = 0; idx[2] = 0;
    setSbuffer(SW_Send, spread_rect, idx, dimSbuffer, dimSpreadRect);
    idx[0] = 0; idx[1] = 0; idx[2] = lnz+Msp;
    setSbuffer(NW_Send, spread_rect, idx, dimSbuffer, dimSpreadRect);

    /*
     * Halo exchange. Every rank sends a halo to one neighbour and receives the
     * matching halo from the opposite neighbour. MPI_Sendrecv is used instead
     * of parity-ordered blocking MPI_Send/MPI_Recv so the exchange cannot
     * deadlock when a neighbour is the rank itself (P1 == 1 or P2 == 1) or
     * when a processor-grid dimension is odd.
     */
    t1 = MPI_Wtime();

    /* NORTH <-> SOUTH */
    MPI_Sendrecv(N_Send, ns_count, MPI_DOUBLE, NORTH, TAG_TO_N,
                 S_Recv, ns_count, MPI_DOUBLE, SOUTH, TAG_TO_N,
                 MPI_COMM_WORLD, &status);
    MPI_Sendrecv(S_Send, ns_count, MPI_DOUBLE, SOUTH, TAG_TO_S,
                 N_Recv, ns_count, MPI_DOUBLE, NORTH, TAG_TO_S,
                 MPI_COMM_WORLD, &status);

    /* EAST <-> WEST */
    MPI_Sendrecv(E_Send, ew_count, MPI_DOUBLE, EAST, TAG_TO_E,
                 W_Recv, ew_count, MPI_DOUBLE, WEST, TAG_TO_E,
                 MPI_COMM_WORLD, &status);
    MPI_Sendrecv(W_Send, ew_count, MPI_DOUBLE, WEST, TAG_TO_W,
                 E_Recv, ew_count, MPI_DOUBLE, EAST, TAG_TO_W,
                 MPI_COMM_WORLD, &status);

    /* NE <-> SW */
    MPI_Sendrecv(NE_Send, corner_count, MPI_DOUBLE, NE, TAG_TO_NE,
                 SW_Recv, corner_count, MPI_DOUBLE, SW, TAG_TO_NE,
                 MPI_COMM_WORLD, &status);
    MPI_Sendrecv(SW_Send, corner_count, MPI_DOUBLE, SW, TAG_TO_SW,
                 NE_Recv, corner_count, MPI_DOUBLE, NE, TAG_TO_SW,
                 MPI_COMM_WORLD, &status);

    /* NW <-> SE */
    MPI_Sendrecv(NW_Send, corner_count, MPI_DOUBLE, NW, TAG_TO_NW,
                 SE_Recv, corner_count, MPI_DOUBLE, SE, TAG_TO_NW,
                 MPI_COMM_WORLD, &status);
    MPI_Sendrecv(SE_Send, corner_count, MPI_DOUBLE, SE, TAG_TO_SE,
                 NW_Recv, corner_count, MPI_DOUBLE, NW, TAG_TO_SE,
                 MPI_COMM_WORLD, &status);

    comm_localt = MPI_Wtime() - t1;

    /* add the received halos to the matching edge region of the local rectangle */
    dimRbuffer[0] = lnx; dimRbuffer[1] = lny; dimRbuffer[2] = Msp;
    idx[0] = 0; idx[1] = 0; idx[2] = lnz-Msp;
    getRbuffer(N_Recv, local_rect, idx, dimRbuffer, isize);
    idx[0] = 0; idx[1] = 0; idx[2] = 0;
    getRbuffer(S_Recv, local_rect, idx, dimRbuffer, isize);

    dimRbuffer[0] = lnx; dimRbuffer[1] = Msp; dimRbuffer[2] = lnz;
    idx[0] = 0; idx[1] = 0; idx[2] = 0;
    getRbuffer(W_Recv, local_rect, idx, dimRbuffer, isize);
    idx[0] = 0; idx[1] = lny-Msp; idx[2] = 0;
    getRbuffer(E_Recv, local_rect, idx, dimRbuffer, isize);

    dimRbuffer[0] = lnx; dimRbuffer[1] = Msp; dimRbuffer[2] = Msp;
    idx[0] = 0; idx[1] = 0; idx[2] = lnz-Msp;
    getRbuffer(NW_Recv, local_rect, idx, dimRbuffer, isize);
    idx[0] = 0; idx[1] = 0; idx[2] = 0;
    getRbuffer(SW_Recv, local_rect, idx, dimRbuffer, isize);
    idx[0] = 0; idx[1] = lny-Msp; idx[2] = 0;
    getRbuffer(SE_Recv, local_rect, idx, dimRbuffer, isize);
    idx[0] = 0; idx[1] = lny-Msp; idx[2] = lnz-Msp;
    getRbuffer(NE_Recv, local_rect, idx, dimRbuffer, isize);

    /* step 2: take FFT on local_rect */
    MPI_Barrier(MPI_COMM_WORLD);
    t1 = MPI_Wtime();
    Cp3dfft_ftran_r2c(local_rect, output_rect, op_f);
    fft_localt = MPI_Wtime() - t1;

    t2 = MPI_Wtime() - t2;

    /* sum the per-rank timings on rank 0 and report the averages */
    comm_globalt = 0;
    fft_globalt  = 0;
    grid_globalt = 0;
    totalt       = 0;
    MPI_Reduce(&comm_localt, &comm_globalt, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&fft_localt,  &fft_globalt,  1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&t2,          &totalt,       1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&grid_localt, &grid_globalt, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    if (proc_id == 0) {
        printf("avg gridding time per proc %f\n", grid_globalt/nproc);
        printf("avg comm time per proc %f\n", comm_globalt/nproc);
        printf("avg fft time per proc %f\n", fft_globalt/nproc);
        printf("avg total time proc %f\n", totalt/nproc);
    }

    /* step 3: Deconvolution (not implemented) */

    /* clean up */
    Cp3dfft_clean();
    free(xj); free(yj); free(zj);
    free(E2xl); free(E2yl); free(E2zl);
    free(E3); free(E4);
    free(spread_rect);
    free(local_rect);
    free(output_rect);
    free(N_Recv);  free(N_Send);
    free(S_Recv);  free(S_Send);
    free(W_Recv);  free(W_Send);
    free(E_Recv);  free(E_Send);
    free(NW_Recv); free(NW_Send);
    free(SE_Recv); free(SE_Send);
    free(SW_Recv); free(SW_Send);
    free(NE_Recv); free(NE_Send);

    MPI_Finalize();
    return 0;
}
/*
 * P3DFFT sample driver: fills a real 3D array with sin(x)*sin(y)*sin(z),
 * runs n forward (real-to-complex) + backward (complex-to-real) transform
 * pairs, checks that the round trip reproduces the input and prints timing
 * statistics gathered on rank 0.
 *
 * Input is read on rank 0 from a file literally named "stdin"
 * ("nx ny nz ndim n"); when it is missing or unreadable the defaults
 * nx=ny=nz=128, ndim=2, n=1 are used. For ndim == 2 the processor grid is
 * read from a file named "dims" when present, otherwise it is built with
 * MPI_Dims_create.
 */
int main(int argc,char **argv)
{
#ifndef SINGLE_PREC
   double *A,*B,*p,*C;
#else
   float *A,*B,*p,*C;
#endif

   int i,x,y,z,nx,ny,nz,proc_id,nproc,dims[2],ndim;
   int istart[3],isize[3],iend[3];
   int fstart[3],fsize[3],fend[3];
   int conf,m,n;
   int have_input = 0, have_dims = 0;
   long int Nglob,Ntot;
   double pi,twopi,sinyz;
   double *sinx,*siny,*sinz,factor;
   double rtime1,rtime2 = 0.0,gt[12],gt1[12],gt2[12],timers[12];
   double cdiff,ccdiff = 0.0,ans,prec;
   FILE *fp;
   unsigned char op_f[]="fft", op_b[]="tff";
   int memsize[3];

#ifndef SINGLE_PREC
   void print_all(double *,long int,int,long int),mult_array(double *,long int,double);
#else
   void print_all(float *,long int,int,long int),mult_array(float *,long int,double);
#endif

   MPI_Init(&argc,&argv);
   MPI_Comm_size(MPI_COMM_WORLD,&nproc);
   MPI_Comm_rank(MPI_COMM_WORLD,&proc_id);

   pi = atan(1.0)*4.0;
   twopi = 2.0*pi;

   for(i=0; i< 12; i++) {
      gt[i] = 0.0;
      gt1[i] = 0.0;
      gt2[i] = 1E10;
   }

   Cset_timers();

   /* Rank 0 reads the problem description; everybody else gets it by broadcast. */
   if(proc_id == 0) {
      fp = fopen("stdin", "r");
      if(fp == NULL) {
         printf("Cannot open file. Setting to default nx=ny=nz=128, ndim=2, n=1.\n");
      }
      else {
         have_input = (fscanf(fp,"%d %d %d %d %d\n",&nx,&ny,&nz,&ndim,&n) == 5);
         fclose(fp);
         if(!have_input)
            printf("Cannot parse file stdin. Setting to default nx=ny=nz=128, ndim=2, n=1.\n");
      }
      if(!have_input) {
         /* ndim must be set here too: it is broadcast and selects the grid below */
         nx=ny=nz=128;
         ndim=2;
         n=1;
      }
#ifndef SINGLE_PREC
      printf("Double precision\n (%d %d %d) grid\n %d proc. dimensions\n%d repetitions\n",nx,ny,nz,ndim,n);
#else
      printf("Single precision\n (%d %d %d) grid\n %d proc. dimensions\n%d repetitions\n",nx,ny,nz,ndim,n);
#endif
   }
   MPI_Bcast(&nx,1,MPI_INT,0,MPI_COMM_WORLD);
   MPI_Bcast(&ny,1,MPI_INT,0,MPI_COMM_WORLD);
   MPI_Bcast(&nz,1,MPI_INT,0,MPI_COMM_WORLD);
   MPI_Bcast(&n,1,MPI_INT,0,MPI_COMM_WORLD);
   MPI_Bcast(&ndim,1,MPI_INT,0,MPI_COMM_WORLD);

   /* Reject input that would leave dims unset, divide by zero in the timing
      averages, or skip the transform loop and then check an unwritten array. */
   if(nx <= 0 || ny <= 0 || nz <= 0 || n <= 0 || (ndim != 1 && ndim != 2)) {
      if(proc_id == 0)
         fprintf(stderr,"Invalid input: nx, ny, nz, n must be positive and ndim must be 1 or 2\n");
      MPI_Abort(MPI_COMM_WORLD,1);
      return 1;
   }

   if(ndim == 1) {
      dims[0] = 1;
      dims[1] = nproc;
   }
   else {
      fp = fopen("dims","r");
      if(fp != NULL) {
         if(proc_id == 0)
            printf("Reading proc. grid from file dims\n");
         /* dims[0] must be positive: it is used as a divisor just below */
         have_dims = (fscanf(fp,"%d %d\n",dims,dims+1) == 2 && dims[0] > 0);
         fclose(fp);
         if(have_dims && dims[0]*dims[1] != nproc)
            dims[1] = nproc / dims[0];
      }
      if(!have_dims) {
         if(proc_id == 0)
            printf("Creating proc. grid with mpi_dims_create\n");
         dims[0]=dims[1]=0;
         MPI_Dims_create(nproc,2,dims);
         /* keep the smaller factor first */
         if(dims[0] > dims[1]) {
            dims[0] = dims[1];
            dims[1] = nproc/dims[0];
         }
      }
   }

   if(proc_id == 0)
      printf("Using processor grid %d x %d\n",dims[0],dims[1]);

   /* Initialize P3DFFT */
   Cp3dfft_setup(dims,nx,ny,nz,MPI_Comm_c2f(MPI_COMM_WORLD),nx,ny,nz,1,memsize);

   /* Get dimensions for input array - real numbers, X-pencil shape.
      Note that we are following the Fortran ordering, i.e.
      the dimension with stride-1 is X. */
   conf = 1;
   Cp3dfft_get_dims(istart,iend,isize,conf);

   /* Get dimensions for output array - complex numbers, Z-pencil shape.
      Stride-1 dimension could be X or Z, depending on how the library
      was compiled (stride1 option) */
   conf = 2;
   Cp3dfft_get_dims(fstart,fend,fsize,conf);

   /* Allocate and Initialize */
#ifndef SINGLE_PREC
   A = (double *) malloc(sizeof(double) * isize[0]*isize[1]*isize[2]);
   B = (double *) malloc(sizeof(double) * fsize[0]*fsize[1]*fsize[2]*2);
   C = (double *) malloc(sizeof(double) * isize[0]*isize[1]*isize[2]);
#else
   A = (float *) malloc(sizeof(float) * isize[0]*isize[1]*isize[2]);
   B = (float *) malloc(sizeof(float) * fsize[0]*fsize[1]*fsize[2]*2);
   C = (float *) malloc(sizeof(float) * isize[0]*isize[1]*isize[2]);
#endif

   if(A == NULL)
      printf("%d: Error allocating array A (%d)\n",proc_id,isize[0]*isize[1]*isize[2]);
   if(B == NULL)
      printf("%d: Error allocating array B (%d)\n",proc_id,fsize[0]*fsize[1]*fsize[2]*2);
   if(C == NULL)
      printf("%d: Error allocating array C (%d)\n",proc_id,isize[0]*isize[1]*isize[2]);

   sinx = (double *) malloc(sizeof(double)*nx);
   siny = (double *) malloc(sizeof(double)*ny);
   sinz = (double *) malloc(sizeof(double)*nz);

   /* The arrays are dereferenced right away, so a failed allocation is fatal. */
   if(A == NULL || B == NULL || C == NULL || sinx == NULL || siny == NULL || sinz == NULL) {
      fprintf(stderr,"%d: out of memory\n",proc_id);
      MPI_Abort(MPI_COMM_WORLD,1);
      return 1;
   }

   /* istart is 1-based, hence the -1 when converting to a global 0-based index */
   for(z=0;z < isize[2];z++)
      sinz[z] = sin((z+istart[2]-1)*twopi/nz);
   for(y=0;y < isize[1];y++)
      siny[y] = sin((y+istart[1]-1)*twopi/ny);
   for(x=0;x < isize[0];x++)
      sinx[x] = sin((x+istart[0]-1)*twopi/nx);

   p = A;
   for(z=0;z < isize[2];z++)
      for(y=0;y < isize[1];y++) {
         sinyz = siny[y]*sinz[z];
         for(x=0;x < isize[0];x++)
            *p++ = sinx[x]*sinyz;
      }

   /* products are accumulated in long int so large grids do not overflow int */
   Ntot = (long int) fsize[0]*fsize[1];
   Ntot *= fsize[2]*2;
   Nglob = (long int) nx * ny;
   Nglob *= nz;
   factor = 1.0/Nglob;

   rtime1 = 0.0;
   for(m=0;m < n;m++) {

      if(proc_id == 0)
         printf("Iteration %d\n",m);

      MPI_Barrier(MPI_COMM_WORLD);
      rtime1 = rtime1 - MPI_Wtime();
      /* compute forward Fourier transform on A, store results in B */
      Cp3dfft_ftran_r2c(A,B,op_f);
      rtime1 = rtime1 + MPI_Wtime();

      if(proc_id == 0)
         printf("Result of forward transform\n");

      print_all(B,Ntot,proc_id,Nglob);
      /* normalize */
      mult_array(B,Ntot,factor);

      /* Compute backward transform on B, store results in C */
      MPI_Barrier(MPI_COMM_WORLD);
      rtime1 = rtime1 - MPI_Wtime();
      Cp3dfft_btran_c2r(B,C,op_b);
      rtime1 = rtime1 + MPI_Wtime();
   }

   /* free work space */
   Cp3dfft_clean();

   /* Check results: largest absolute deviation of C from the analytic input */
   cdiff = 0.0; p = C;
   for(z=0;z < isize[2];z++)
      for(y=0;y < isize[1];y++) {
         sinyz = siny[y]*sinz[z];
         for(x=0;x < isize[0];x++) {
            ans = sinx[x]*sinyz;
            if(cdiff < fabs(*p - ans))
               cdiff = fabs(*p - ans);
            p++;
         }
      }

   Cget_timers(timers);

   /* cdiff and ccdiff are double in both precisions, so the reduction type
      is MPI_DOUBLE regardless of SINGLE_PREC */
   MPI_Reduce(&cdiff,&ccdiff,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);

   if(proc_id == 0) {
#ifndef SINGLE_PREC
      prec = 1.0e-14;
#else
      prec = 1.0e-5;
#endif
      if(ccdiff > prec * Nglob*0.25)
         printf("Results are incorrect\n");
      else
         printf("Results are correct\n");
      printf("max diff =%g\n",ccdiff);
   }

   /* Gather timing statistics */
   MPI_Reduce(&rtime1,&rtime2,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);

   for (i=0;i < 12;i++) {
      timers[i] = timers[i] / ((double) n);
   }

   /* sum (turned into an average below), max and min of each timer over ranks */
   MPI_Reduce(timers,gt,12,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
   MPI_Reduce(timers,gt1,12,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);
   MPI_Reduce(timers,gt2,12,MPI_DOUBLE,MPI_MIN,0,MPI_COMM_WORLD);

   for (i=0;i < 12;i++) {
      gt[i] = gt[i]/ ((double) nproc);
   }

   if(proc_id == 0) {
      printf("Time per loop=%lg\n",rtime2/((double) n));
      for(i=0;i < 12;i++) {
         printf("timer[%d] (avg/max/min): %lE %lE %lE\n",i+1,gt[i],gt1[i],gt2[i]);
      }
   }

   free(A); free(B); free(C);
   free(sinx); free(siny); free(sinz);

   MPI_Finalize();
   return 0;
}
void test_P3DFFT(int *n, std::ofstream& results, int decomp, int * dims){ int nx,ny,nz,procid,nprocs,ndim; int istart[3],isize[3],iend[3]; int fstart[3],fsize[3],fend[3]; int p3dfft_mem_conf,nrep; long int Nlocal,Nglob; double factor; double l_timers[12]={0},g_timers[12]={0}; double total_time=0*MPI_Wtime(), setup_time=0; // rtime_local is timings on each process and _global is the max reduced to root // 0 is the forward FFT time, 1 is the Hadamard multiplication, 2 is the IFFT time, 3 is the sum of 0-2, and 4 is the setup time // The communication time is measured by l_timers locally on each process and then reduced to g_timers to the root. // the sum of first four elements give the comm time unsigned char op_f[4]="fft", op_b[4]="tff"; int memsize[3]; MPI_Comm_size(MPI_COMM_WORLD,&nprocs); MPI_Comm_rank(MPI_COMM_WORLD,&procid); nx=n[0]; ny=n[1]; nz=n[2]; ndim=1; nrep=NREP; if(decomp==1){ dims[0] = 1; dims[1] = nprocs; } if(procid == 0) printf("Using processor grid %d x %d\n",dims[0],dims[1]); /* Initialize P3DFFT */ MPI_Barrier(MPI_COMM_WORLD); setup_time -= MPI_Wtime(); //Compute Setup Time. Cp3dfft_setup(dims,nx,ny,nz,MPI_Comm_c2f(MPI_COMM_WORLD),nx,ny,nz,1,memsize); setup_time += MPI_Wtime(); //Compute Setup Time. 
PCOUT<<"done with setup"<<std::endl; Cp3dfft_get_dims(istart,iend,isize,1); Cp3dfft_get_dims(fstart,fend,fsize,2); /* Allocate and initialize */ double *A; // Input matrix A A=(double*)fftw_malloc(sizeof(double)*(memsize[0]*memsize[1]*memsize[2]*2)); //B=(double*)fftw_malloc(sizeof(double)*(memsize[0]*memsize[1]*memsize[2]*2)); /* Warmup */ Cp3dfft_ftran_r2c(A,A,op_f); Cp3dfft_ftran_r2c(A,A,op_f); MPI_Barrier(MPI_COMM_WORLD); Cset_timers(); for (int rep=0; rep<nrep; rep++){ initialize_p3dfft(A,n); MPI_Barrier(MPI_COMM_WORLD); /* Forward transform */ total_time -= MPI_Wtime(); Cp3dfft_ftran_r2c(A,A,op_f); total_time += MPI_Wtime(); MPI_Barrier(MPI_COMM_WORLD); } Cget_timers(l_timers); Cp3dfft_btran_c2r(A,A,op_b); /* Compute Error */ //PCOUT<<"Done With FFTs computing error"<<std::endl; compute_error_p3dfft(A,n); /* Gather timing statistics */ double g_total_time, g_comm_time, g_setup_time; MPI_Reduce(&total_time,&g_total_time,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); MPI_Reduce(&setup_time,&g_setup_time,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); MPI_Reduce(&l_timers,&g_timers,12,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); g_total_time=g_total_time/nrep; g_comm_time=(g_timers[0]+g_timers[1]+g_timers[2]+g_timers[3])/((double) nrep); //g_total_time=g_total_time/((double)nrep); ptrdiff_t size=n[0];size*=n[1]; size*=n[2]; double gflops=2.5*size*( log2(n[2]) + log2(n[0])+ log2(n[1]) )/(g_total_time)/1e9; if(procid == 0){ std::cout.precision(4); std::cout<<"P3DFFT Size="<<n[0]<<" "<<n[1]<<" "<<n[2]<<std::endl;; std::cout<<"0= "<<g_timers[0]<<" 1= "<<g_timers[1]<<" 2= "<<g_timers[2]<<" 3= "<<g_timers[3]<<" 4= "<<g_timers[4]<<std::endl; std::cout<<"5= "<<g_timers[5]<<" 6= "<<g_timers[6]<<" 7= "<<g_timers[7]<<" 8= "<<g_timers[8]<<" 9= "<<g_timers[9]<<std::endl; std::cout<<"10= "<<g_timers[10]<<" 11= "<<g_timers[11]<<std::endl; std::cout<<"\033[1;31m"; std::cout<<"\t"<<"np"<<"\t"<<"Grid"<<"\t"<<"Total"<<'\t'<<"Comm Time"<<"\t"<<"Setup Time"<<"\t"<<"\t"<<"Reps"<<'\t'<<"GFlops"<<std::endl; 
std::cout<<"\t"<<nprocs<<"\t"<<dims[1]<<"*"<<dims[0]<<"\t"<<g_total_time<<'\t'<<g_comm_time<<"\t"<<g_setup_time<<"\t"<<nrep<<'\t'<<gflops<<std::endl; std::cout<<"\033[0m\n"<<std::endl; results<<"\t"<<nprocs<<"\t"<<dims[1]<<"*"<<dims[0]<<"\t"<<g_total_time<<'\t'<<g_comm_time<<"\t"<<g_setup_time<<"\t"<<nrep<<'\t'<<gflops<<std::endl; } /* Free work space */ fftw_free(A); Cp3dfft_clean(); }