/*
 * Driver for the tiled (OpenMP) 2D particle code.
 *
 * Sets up the grid and particle arrays from the hard-coded parameters
 * below, then runs nloop = tend/dt time steps.  Each step performs, in
 * order: charge deposit, guard-cell add, forward FFT, field solve,
 * inverse FFT, guard-cell copy, particle push and particle reorder.  The
 * time reported by dtimer for each phase is accumulated and printed at the
 * end together with the field and kinetic energies.
 *
 * argc and argv are not used.
 * Returns 0 on success.  Calls exit(1) if a host allocation fails or if a
 * library routine reports a non-zero irc.
 */
int main(int argc, char *argv[]) {
/* grid exponents: nx = 2**indx, ny = 2**indy (see the shifts below) */
   int indx = 9, indy = 9;
/* particle counts per direction; total particle count is np = npx*npy */
   int npx = 3072, npy = 3072;
/* ndim multiplies the grid size when sizing the force array fxye */
   int ndim = 2;
/* tend/dt gives the number of time steps; qme is passed to the deposit */
/* routine and is also used as qbme in the push */
   float tend = 10.0, dt = 0.1, qme = -1.0;
/* velocity parameters handed to cdistr2 when initializing electrons */
   float vtx = 1.0, vty = 1.0, vx0 = 0.0, vy0 = 0.0;
/* parameters handed to the field solver cmpois22 */
   float ax = .912871, ay = .912871;
/* idimp = dimension of phase space = 4 */
   int idimp = 4, ipbc = 1;
   float wke = 0.0, we = 0.0, wt = 0.0;
/* sorting tiles, should be less than or equal to 32 */
   int mx = 16, my = 16;
/* fraction of extra particles needed for particle management */
   float xtras = 0.2;
/* declare scalars for standard code */
   int j;
   int np, nx, ny, nxh, nyh, nxe, nye, nxeh, nxyh, nxhy;
   int mx1, my1, mxy1, ntime, nloop, isign;
   float qbme, affp;
/* declare scalars for OpenMP code */
   int nppmx, nppmx0, ntmax, npbmx, irc;
   int nvp;
/* declare arrays for standard code */
   float *part = NULL;
   float *qe = NULL;
   float *fxye = NULL;
   float complex *ffc = NULL;
   int *mixup = NULL;
   float complex *sct = NULL;
/* declare arrays for OpenMP (tiled) code */
   float *ppart = NULL, *ppbuff = NULL;
   int *kpic = NULL;
   int *ncl = NULL;
   int *ihole = NULL;
/* declare and initialize timing data */
   float time;
   struct timeval itime;
   float tdpost = 0.0, tguard = 0.0, tfft = 0.0, tfield = 0.0;
   float tpush = 0.0, tsort = 0.0;
   double dtime;

/* the command line is intentionally ignored by this driver */
   (void) argc;
   (void) argv;

   irc = 0;
/* nvp = number of shared memory nodes (0=default) */
   nvp = 0;
/* printf("enter number of nodes:\n"); */
/* scanf("%i",&nvp); */

/* initialize for shared memory parallel processing */
   cinit_omp(nvp);

/* initialize scalars for standard code */
   np = npx*npy;
   nx = 1L<<indx;
   ny = 1L<<indy;
   nxh = nx/2;
   nyh = ny/2;
   nxe = nx + 2;
   nye = ny + 1;
   nxeh = nxe/2;
   nxyh = (nx > ny ? nx : ny)/2;
   nxhy = nxh > ny ? nxh : ny;
/* number of tiles in x and y (rounded up), and their product */
   mx1 = (nx - 1)/mx + 1;
   my1 = (ny - 1)/my + 1;
   mxy1 = mx1*my1;
/* the small offset guards against tend/dt landing just below an integer */
   nloop = tend/dt + .0001;
   ntime = 0;
   qbme = qme;
   affp = (float) (nx*ny)/(float ) np;

/* allocate and initialize data for standard code */
/* sizes are computed in size_t so the products cannot overflow int */
   part = (float *) malloc((size_t) idimp*np*sizeof(float));
   qe = (float *) malloc((size_t) nxe*nye*sizeof(float));
   fxye = (float *) malloc((size_t) ndim*nxe*nye*sizeof(float));
   ffc = (float complex *) malloc((size_t) nxh*nyh*sizeof(float complex));
   mixup = (int *) malloc((size_t) nxhy*sizeof(int));
   sct = (float complex *) malloc((size_t) nxyh*sizeof(float complex));
   kpic = (int *) malloc((size_t) mxy1*sizeof(int));
   if (part == NULL || qe == NULL || fxye == NULL || ffc == NULL ||
       mixup == NULL || sct == NULL || kpic == NULL) {
      printf("memory allocation error for standard arrays\n");
      exit(1);
   }

/* prepare fft tables */
   cwfft2rinit(mixup,sct,indx,indy,nxhy,nxyh);
/* calculate form factors */
   isign = 0;
   cmpois22((float complex *)qe,(float complex *)fxye,isign,ffc,ax,ay,
            affp,&we,nx,ny,nxeh,nye,nxh,nyh);
/* initialize electrons */
   cdistr2(part,vtx,vty,vx0,vy0,npx,npy,idimp,np,nx,ny,ipbc);

/* find number of particles in each of mx, my tiles: updates kpic, nppmx */
   cdblkp2l(part,kpic,&nppmx,idimp,np,mx,my,mx1,mxy1,&irc);
   if (irc != 0) {
      printf("cdblkp2l error, irc=%d\n",irc);
      exit(1);
   }

/* allocate vector particle data */
/* tile capacity is nppmx plus the xtras fraction of headroom */
   nppmx0 = (1.0 + xtras)*nppmx;
   ntmax = xtras*nppmx;
   npbmx = xtras*nppmx;
   ppart = (float *) malloc((size_t) idimp*nppmx0*mxy1*sizeof(float));
   ppbuff = (float *) malloc((size_t) idimp*npbmx*mxy1*sizeof(float));
   ncl = (int *) malloc((size_t) 8*mxy1*sizeof(int));
   ihole = (int *) malloc((size_t) 2*(ntmax+1)*mxy1*sizeof(int));
   if (ppart == NULL || ppbuff == NULL || ncl == NULL || ihole == NULL) {
      printf("memory allocation error for tiled arrays\n");
      exit(1);
   }

/* copy ordered particle data for OpenMP: updates ppart and kpic */
   cppmovin2l(part,ppart,kpic,nppmx0,idimp,np,mx,my,mx1,mxy1,&irc);
   if (irc != 0) {
      printf("cppmovin2l overflow error, irc=%d\n",irc);
      exit(1);
   }
/* sanity check */
   cppcheck2l(ppart,kpic,idimp,nppmx0,nx,ny,mx,my,mx1,my1,&irc);
   if (irc != 0) {
      printf("%d,cppcheck2l error: irc=%d\n",ntime,irc);
      exit(1);
   }

/* * * * start main iteration loop * * * */

   while (ntime < nloop) {
/*    printf("ntime = %i\n",ntime); */

/* deposit charge with OpenMP: updates qe */
      dtimer(&dtime,&itime,-1);
      for (j = 0; j < nxe*nye; j++) {
         qe[j] = 0.0;
      }
      cgppost2l(ppart,qe,kpic,qme,nppmx0,idimp,mx,my,nxe,nye,mx1,mxy1);
      dtimer(&dtime,&itime,1);
      time = (float) dtime;
      tdpost += time;

/* add guard cells with OpenMP: updates qe */
      dtimer(&dtime,&itime,-1);
      caguard2l(qe,nx,ny,nxe,nye);
      dtimer(&dtime,&itime,1);
      time = (float) dtime;
      tguard += time;

/* transform charge to fourier space with OpenMP: updates qe */
      dtimer(&dtime,&itime,-1);
      isign = -1;
      cwfft2rmx((float complex *)qe,isign,mixup,sct,indx,indy,nxeh,
                nye,nxhy,nxyh);
      dtimer(&dtime,&itime,1);
      time = (float) dtime;
      tfft += time;

/* calculate force/charge in fourier space with OpenMP: updates fxye, we */
      dtimer(&dtime,&itime,-1);
      isign = -1;
      cmpois22((float complex *)qe,(float complex *)fxye,isign,ffc,ax,
               ay,affp,&we,nx,ny,nxeh,nye,nxh,nyh);
      dtimer(&dtime,&itime,1);
      time = (float) dtime;
      tfield += time;

/* transform force to real space with OpenMP: updates fxye */
      dtimer(&dtime,&itime,-1);
      isign = 1;
      cwfft2rm2((float complex *)fxye,isign,mixup,sct,indx,indy,nxeh,
                nye,nxhy,nxyh);
      dtimer(&dtime,&itime,1);
      time = (float) dtime;
      tfft += time;

/* copy guard cells with OpenMP: updates fxye */
      dtimer(&dtime,&itime,-1);
      ccguard2l(fxye,nx,ny,nxe,nye);
      dtimer(&dtime,&itime,1);
      time = (float) dtime;
      tguard += time;

/* push particles with OpenMP: */
      wke = 0.0;
      dtimer(&dtime,&itime,-1);
/* updates ppart, wke */
/*    cgppush2l(ppart,fxye,kpic,qbme,dt,&wke,idimp,nppmx0,nx,ny,mx,my, */
/*              nxe,nye,mx1,mxy1,ipbc);                                */
/* updates ppart, ncl, ihole, wke, irc */
      cgppushf2l(ppart,fxye,kpic,ncl,ihole,qbme,dt,&wke,idimp,nppmx0,
                 nx,ny,mx,my,nxe,nye,mx1,mxy1,ntmax,&irc);
      dtimer(&dtime,&itime,1);
      time = (float) dtime;
      tpush += time;
      if (irc != 0) {
         printf("cgppushf2l error: irc=%d\n",irc);
         exit(1);
      }

/* reorder particles by tile with OpenMP: */
      dtimer(&dtime,&itime,-1);
/* updates ppart, ppbuff, kpic, ncl, ihole, and irc */
/*    cpporder2l(ppart,ppbuff,kpic,ncl,ihole,idimp,nppmx0,nx,ny,mx,my, */
/*               mx1,my1,npbmx,ntmax,&irc);                            */
/* updates ppart, ppbuff, kpic, ncl, and irc */
      cpporderf2l(ppart,ppbuff,kpic,ncl,ihole,idimp,nppmx0,mx1,my1,
                  npbmx,ntmax,&irc);
      dtimer(&dtime,&itime,1);
      time = (float) dtime;
      tsort += time;
      if (irc != 0) {
         printf("cpporderf2l error: ntmax, irc=%d,%d\n",ntmax,irc);
         exit(1);
      }

      if (ntime==0) {
         printf("Initial Field, Kinetic and Total Energies:\n");
         printf("%e %e %e\n",we,wke,wke+we);
      }
      ntime += 1;
   }

/* * * * end main iteration loop * * * */

   printf("ntime = %i\n",ntime);
   printf("Final Field, Kinetic and Total Energies:\n");
   printf("%e %e %e\n",we,wke,wke+we);

   printf("\n");
   printf("deposit time = %f\n",tdpost);
   printf("guard time = %f\n",tguard);
   printf("solver time = %f\n",tfield);
   printf("fft time = %f\n",tfft);
   printf("push time = %f\n",tpush);
   printf("sort time = %f\n",tsort);
/* from here on tfield holds solver + guard + fft time */
   tfield += tguard + tfft;
   printf("total solver time = %f\n",tfield);
   time = tdpost + tpush + tsort;
   printf("total particle time = %f\n",time);
   wt = time + tfield;
   printf("total time = %f\n",wt);
   printf("\n");

/* wt is reused as the factor converting seconds to nsec per particle per step */
   wt = 1.0e+09/(((float) nloop)*((float) np));
   printf("Push Time (nsec) = %f\n",tpush*wt);
   printf("Deposit Time (nsec) = %f\n",tdpost*wt);
   printf("Sort Time (nsec) = %f\n",tsort*wt);
   printf("Total Particle Time (nsec) = %f\n",time*wt);
   printf("\n");

/* release host memory */
   free(ihole);
   free(ncl);
   free(ppbuff);
   free(ppart);
   free(kpic);
   free(sct);
   free(mixup);
   free(ffc);
   free(fxye);
   free(qe);
   free(part);

   return 0;
}
int main(int argc, char *argv[]) { int indx = 6, indy = 7, npx = 48, npy = 12; float tend = 65.0f, dt = 0.1f; // parse in parameters if(argc == 7) { indx = atoi(argv[1]); indy = atoi(argv[2]); npx = atoi(argv[3]); npy = atoi(argv[4]); tend = atof(argv[5]); dt = atof(argv[6]); if(!(indx && indy && npx && npy && tend && dt)) { printf("One or more parameters are invalid.\n"); exit(1); } } else if(argc != 1) { printf("Usage: %s indx indy npx npy tend dt\n", argv[0]); exit(1); } else { printf("Using default parameters...\n"); } int ndim = 2; float qme = -1.0; float vtx = 1.0, vty = 1.0, vx0 = 0.0, vy0 = 0.0; float ax = .912871, ay = .912871; /* idimp = dimension of phase space = 4 */ /* sortime = number of time steps between standard electron sorting */ int idimp = 4, ipbc = 1, sortime = 50; float wke = 0.0, we = 0.0, wt = 0.0; /* declare scalars for standard code */ int j; int np, nx, ny, nxh, nyh, nxe, nye, nxeh, nxyh, nxhy; int ny1, ntime, nloop, isign; float qbme, affp; /* declare arrays for standard code */ float *part = NULL, *part2 = NULL, *tpart = NULL; float *qe = NULL; float *fxye = NULL; float complex *ffc = NULL; int *mixup = NULL; float complex *sct = NULL; int *npicy = NULL; /* declare and initialize timing data */ float time; struct timeval itime; float tdpost = 0.0, tguard = 0.0, tfft = 0.0, tfield = 0.0; float tpush = 0.0, tsort = 0.0; double dtime; /* initialize scalars for standard code */ np = npx*npy; nx = 1L<<indx; ny = 1L<<indy; nxh = nx/2; nyh = ny/2; nxe = nx + 2; nye = ny + 1; nxeh = nxe/2; nxyh = (nx > ny ? nx : ny)/2; nxhy = nxh > ny ? 
nxh : ny; ny1 = ny + 1; nloop = tend/dt + .0001; ntime = 0; qbme = qme; affp = (float) (nx*ny)/(float ) np; /* allocate and initialize data for standard code */ part = (float *) malloc(idimp*np*sizeof(float)); part2 = (float *) malloc(idimp*np*sizeof(float)); qe = (float *) malloc(nxe*nye*sizeof(float)); fxye = (float *) malloc(ndim*nxe*nye*sizeof(float)); ffc = (float complex *) malloc(nxh*nyh*sizeof(float complex)); mixup = (int *) malloc(nxhy*sizeof(int)); sct = (float complex *) malloc(nxyh*sizeof(float complex)); npicy = (int *) malloc(ny1*sizeof(int)); /* prepare fft tables */ cwfft2rinit(mixup,sct,indx,indy,nxhy,nxyh); /* calculate form factors */ isign = 0; cpois22((float complex *)qe,(float complex *)fxye,isign,ffc,ax,ay,affp, &we,nx,ny,nxeh,nye,nxh,nyh); /* initialize electrons */ cdistr2(part,vtx,vty,vx0,vy0,npx,npy,idimp,np,nx,ny,ipbc); /* --------------------------------------------------------------------------*/ /* ---------------------------- set up ------------------------------------- */ /* --------------------------------------------------------------------------*/ int sz_qe = nxe * nye * sizeof(float); int sz_part = idimp * np * sizeof(float); int sz_fxye = ndim*nxe*nye*sizeof(float); float* g_part = (float*)copyToGPU(part, sz_part); float* g_qe = (float*)copyToGPU(qe, sz_qe); float* g_fxye = (float*)copyToGPU(fxye, sz_fxye); float* g_wke = (float*)copyToGPU(&wke, sizeof(float)); int* mutexes = createMutexes(nxe * nye); /* --------------------------------------------------------------------------*/ if(VALIDATE) { float* t = (float*)copyFromGPU(g_part, sz_part); float* t2 = (float*)copyFromGPU(g_qe, sz_qe); if(floatArrayCompare(t, part, sz_part / sizeof(float), "copy", "orig", 0) != 0 || floatArrayCompare(t2, qe, sz_qe / sizeof(float), "copy", "orig", 0) !=0) { printf("Copying to and from GPU failed validation.\n"); exit(1); } free(t); } /* * * * start main iteration loop * * * */ L500: if (nloop <= ntime) goto L2000; /* printf("ntime = 
%i\n",ntime); */ /* deposit charge with standard procedure: updates qe */ TS; cgpost2l_cuda(g_part, g_qe, qme, np, idimp, nxe, nye, npx, npy, mutexes); TE(tdpost); for (j = 0; j < nxe*nye; j++) qe[j] = 0.0; cgpost2l(part,qe,qme,np,idimp,nxe,nye); if(VALIDATE) { float* t = (float*)copyFromGPU(g_qe, sz_qe); if(floatArrayCompare(t, qe, sz_qe / sizeof(float), "gpu", "cpu", 1e-4) != 0) { printf("cgpost2l failed validation, ntime=%d\n", ntime); exit(1); } free(t); } /* add guard cells with standard procedure: updates qe */ TS; caguard2l_cuda(g_qe,nx,ny,nxe,nye); TE(tguard); caguard2l(qe,nx,ny,nxe,nye); if(VALIDATE) { float* t = (float*)copyFromGPU(g_qe, sz_qe); if(floatArrayCompare(t, qe, sz_qe / sizeof(float), "gpu", "cpu", 1e-4) != 0) { printf("caguard2l failed validation, ntime=%d\n", ntime); exit(1); } free(t); } /* transform charge to fourier space with standard procedure: updates qe */ TS; copyFromGPU2(qe, g_qe, sz_qe); isign = -1; cwfft2rx((float complex *)qe,isign,mixup,sct,indx,indy,nxeh,nye, nxhy,nxyh); copyToGPU2(g_qe, qe, sz_qe); TE(tfft); /* calculate force/charge in fourier space with standard procedure: */ /* updates fxye */ TS; isign = -1; cpois22((float complex *)qe,(float complex *)fxye,isign,ffc,ax,ay, affp,&we,nx,ny,nxeh,nye,nxh,nyh); TE(tfield); /* transform force to real space with standard procedure: updates fxye */ TS; isign = 1; cwfft2r2((float complex *)fxye,isign,mixup,sct,indx,indy,nxeh,nye, nxhy,nxyh); TE(tfft); /* copy guard cells with standard procedure: updates fxye */ TS; ccguard2l(fxye,nx,ny,nxe,nye); TE(tguard); /* push particles with standard precision: updates part, wke */ TS; copyToGPU2(g_fxye, fxye, sz_fxye); cgpush2l_cuda(g_part,g_fxye,qbme,dt,g_wke,idimp,np,nx,ny,nxe,nye,ipbc,npx,npy,mutexes); TE(tpush); wke = 0.0; cgpush2l(part,fxye,qbme,dt,&wke,idimp,np,nx,ny,nxe,nye,ipbc); if(VALIDATE) { float* t = (float*)copyFromGPU(g_part, sz_part); if(floatArrayCompare(t, part, sz_part / sizeof(float), "gpu", "cpu", 1e-4) != 0) { 
printf("cgpush2l failed sdfsdf validation, ntime=%d\n", ntime); exit(1); } free(t); } /* sort particles by cell for standard code */ if (sortime > 0) { if (ntime%sortime==0) { TS; cdsortp2yl(part,part2,npicy,idimp,np,ny1); /* exchange pointers */ tpart = part; part = part2; part2 = tpart; copyToGPU2(g_part, part, sz_part); TE(tsort); } } if (ntime==0) { printf("Initial Field, Kinetic and Total Energies:\n"); printf("%e %e %e\n",we,wke,wke+we); } ntime += 1; goto L500; L2000: /* * * * end main iteration loop * * * */ printf("ntime = %i\n",ntime); printf("Final Field, Kinetic and Total Energies:\n"); printf("%e %e %e\n",we,wke,wke+we); printf("\n"); printf("deposit time = %f\n",tdpost); printf("guard time = %f\n",tguard); printf("solver time = %f\n",tfield); printf("fft time = %f\n",tfft); printf("push time = %f\n",tpush); printf("sort time = %f\n",tsort); tfield += tguard + tfft; printf("total solver time = %f\n",tfield); time = tdpost + tpush + tsort; printf("total particle time = %f\n",time); wt = time + tfield; printf("total time = %f\n",wt); printf("\n"); wt = 1.0e+09/(((float) nloop)*((float) np)); printf("Push Time (nsec) = %f\n",tpush*wt); printf("Deposit Time (nsec) = %f\n",tdpost*wt); printf("Sort Time (nsec) = %f\n",tsort*wt); printf("Total Particle Time (nsec) = %f\n",time*wt); freeOnGPU(g_part); freeOnGPU(g_qe); freeOnGPU(g_fxye); return 0; }