Exemple #1
0
/*
 * Driver for the tiled OpenMP version of the 2D particle code.
 *
 * Sets up the grid scalars, FFT tables, form factors and particles, copies
 * the particles into per-tile storage, then runs nloop time steps of
 *    deposit -> add guard cells -> FFT -> field solve -> inverse FFT ->
 *    copy guard cells -> push -> reorder
 * while accumulating the wall time of each phase.  The initial and final
 * energies and the accumulated timings are printed to stdout.
 *
 * argc/argv are not used; every run parameter is one of the constants
 * declared at the top of the function.
 *
 * Returns 0 on success.  Calls exit(1) if a host allocation fails or if one
 * of the checked routines (cdblkp2l, cppmovin2l, cppcheck2l, cgppushf2l,
 * cpporderf2l) sets irc to a nonzero value.
 */
int main(int argc, char *argv[]) {
   int indx =   9, indy =   9;
   int npx =  3072, npy =   3072;
   int ndim = 2;
   float tend = 10.0, dt = 0.1, qme = -1.0;
   float vtx = 1.0, vty = 1.0, vx0 = 0.0, vy0 = 0.0;
   float ax = .912871, ay = .912871;
/* idimp = dimension of phase space = 4 */
   int idimp = 4, ipbc = 1;
   float wke = 0.0, we = 0.0, wt = 0.0;
/* sorting tiles, should be less than or equal to 32 */
   int mx = 16, my = 16;
/* fraction of extra particles needed for particle management */
   float xtras = 0.2;
/* declare scalars for standard code */
   int j;
   int np, nx, ny, nxh, nyh, nxe, nye, nxeh, nxyh, nxhy;
   int mx1, my1, mxy1, ntime, nloop, isign;
   float qbme, affp;

/* declare scalars for OpenMP code */
   int nppmx, nppmx0, ntmax, npbmx, irc;
   int nvp;

/* declare arrays for standard code */
   float *part = NULL;
   float *qe = NULL;
   float *fxye = NULL;
   float complex *ffc = NULL;
   int *mixup = NULL;
   float complex *sct = NULL;

/* declare arrays for OpenMP (tiled) code */
   float *ppart = NULL, *ppbuff = NULL;
   int *kpic = NULL;
   int *ncl = NULL;
   int *ihole = NULL;

/* declare and initialize timing data */
   float time;
   struct timeval itime;
   float tdpost = 0.0, tguard = 0.0, tfft = 0.0, tfield = 0.0;
   float tpush = 0.0, tsort = 0.0;
   double dtime;

   irc = 0;
/* nvp = number of shared memory nodes  (0=default) */
   nvp = 0;
/* printf("enter number of nodes:\n"); */
/* scanf("%i",&nvp);                   */
/* initialize for shared memory parallel processing */
   cinit_omp(nvp);

/* initialize scalars for standard code */
   np = npx*npy; nx = 1L<<indx; ny = 1L<<indy; nxh = nx/2; nyh = ny/2;
   nxe = nx + 2; nye = ny + 1; nxeh = nxe/2;
   nxyh = (nx > ny ? nx : ny)/2; nxhy = nxh > ny ? nxh : ny;
/* number of tiles in x, in y, and in total (ceiling division) */
   mx1 = (nx - 1)/mx + 1; my1 = (ny - 1)/my + 1; mxy1 = mx1*my1;
/* the small offset keeps float round-off from dropping the last step */
   nloop = tend/dt + .0001; ntime = 0;
   qbme = qme;
   affp = (float) (nx*ny)/(float ) np;

/* allocate data for standard code; sizes are computed in size_t so the */
/* element counts are not multiplied in int before being scaled         */
   part = (float *) malloc((size_t) idimp*np*sizeof(float));
   qe = (float *) malloc((size_t) nxe*nye*sizeof(float));
   fxye = (float *) malloc((size_t) ndim*nxe*nye*sizeof(float));
   ffc = (float complex *) malloc((size_t) nxh*nyh*sizeof(float complex));
   mixup = (int *) malloc((size_t) nxhy*sizeof(int));
   sct = (float complex *) malloc((size_t) nxyh*sizeof(float complex));
   kpic = (int *) malloc((size_t) mxy1*sizeof(int));
/* stop here rather than hand a NULL buffer to the library routines */
   if (!part || !qe || !fxye || !ffc || !mixup || !sct || !kpic) {
      printf("memory allocation error\n");
      exit(1);
   }

/* prepare fft tables */
   cwfft2rinit(mixup,sct,indx,indy,nxhy,nxyh);
/* calculate form factors */
   isign = 0;
   cmpois22((float complex *)qe,(float complex *)fxye,isign,ffc,ax,ay,
             affp,&we,nx,ny,nxeh,nye,nxh,nyh);
/* initialize electrons */
   cdistr2(part,vtx,vty,vx0,vy0,npx,npy,idimp,np,nx,ny,ipbc);

/* find number of particles in each of mx, my tiles: updates kpic, nppmx */
   cdblkp2l(part,kpic,&nppmx,idimp,np,mx,my,mx1,mxy1,&irc);
   if (irc != 0) {
      printf("cdblkp2l error, irc=%d\n",irc);
      exit(1);
   }
/* allocate vector particle data, with a fraction xtras of headroom */
   nppmx0 = (1.0 + xtras)*nppmx;
   ntmax = xtras*nppmx;
   npbmx = xtras*nppmx;
   ppart = (float *) malloc((size_t) idimp*nppmx0*mxy1*sizeof(float));
   ppbuff = (float *) malloc((size_t) idimp*npbmx*mxy1*sizeof(float));
   ncl = (int *) malloc((size_t) 8*mxy1*sizeof(int));
   ihole = (int *) malloc((size_t) 2*(ntmax+1)*mxy1*sizeof(int));
   if (!ppart || !ppbuff || !ncl || !ihole) {
      printf("memory allocation error\n");
      exit(1);
   }
/* copy ordered particle data for OpenMP: updates ppart and kpic */
   cppmovin2l(part,ppart,kpic,nppmx0,idimp,np,mx,my,mx1,mxy1,&irc);
   if (irc != 0) {
      printf("cppmovin2l overflow error, irc=%d\n",irc);
      exit(1);
   }
/* sanity check */
   cppcheck2l(ppart,kpic,idimp,nppmx0,nx,ny,mx,my,mx1,my1,&irc);
   if (irc != 0) {
      printf("%d,cppcheck2l error: irc=%d\n",ntime,irc);
      exit(1);
   }

/* * * * start main iteration loop * * * */

   for (ntime = 0; ntime < nloop; ntime++) {
/*    printf("ntime = %i\n",ntime); */

/* deposit charge with OpenMP: updates qe */
      dtimer(&dtime,&itime,-1);
      for (j = 0; j < nxe*nye; j++) {
         qe[j] = 0.0;
      }
      cgppost2l(ppart,qe,kpic,qme,nppmx0,idimp,mx,my,nxe,nye,mx1,mxy1);
      dtimer(&dtime,&itime,1);
      time = (float) dtime;
      tdpost += time;

/* add guard cells with OpenMP: updates qe */
      dtimer(&dtime,&itime,-1);
      caguard2l(qe,nx,ny,nxe,nye);
      dtimer(&dtime,&itime,1);
      time = (float) dtime;
      tguard += time;

/* transform charge to fourier space with OpenMP: updates qe */
      dtimer(&dtime,&itime,-1);
      isign = -1;
      cwfft2rmx((float complex *)qe,isign,mixup,sct,indx,indy,nxeh,
                nye,nxhy,nxyh);
      dtimer(&dtime,&itime,1);
      time = (float) dtime;
      tfft += time;

/* calculate force/charge in fourier space with OpenMP: updates fxye, we */
      dtimer(&dtime,&itime,-1);
      isign = -1;
      cmpois22((float complex *)qe,(float complex *)fxye,isign,ffc,ax,
               ay,affp,&we,nx,ny,nxeh,nye,nxh,nyh);
      dtimer(&dtime,&itime,1);
      time = (float) dtime;
      tfield += time;

/* transform force to real space with OpenMP: updates fxye */
      dtimer(&dtime,&itime,-1);
      isign = 1;
      cwfft2rm2((float complex *)fxye,isign,mixup,sct,indx,indy,nxeh,
                nye,nxhy,nxyh);
      dtimer(&dtime,&itime,1);
      time = (float) dtime;
      tfft += time;

/* copy guard cells with OpenMP: updates fxye */
      dtimer(&dtime,&itime,-1);
      ccguard2l(fxye,nx,ny,nxe,nye);
      dtimer(&dtime,&itime,1);
      time = (float) dtime;
      tguard += time;

/* push particles with OpenMP: */
      wke = 0.0;
      dtimer(&dtime,&itime,-1);
/* updates ppart, wke */
/*    cgppush2l(ppart,fxye,kpic,qbme,dt,&wke,idimp,nppmx0,nx,ny,mx,my, */
/*              nxe,nye,mx1,mxy1,ipbc);                                */
/* updates ppart, ncl, ihole, wke, irc */
      cgppushf2l(ppart,fxye,kpic,ncl,ihole,qbme,dt,&wke,idimp,nppmx0,
                 nx,ny,mx,my,nxe,nye,mx1,mxy1,ntmax,&irc);
      dtimer(&dtime,&itime,1);
      time = (float) dtime;
      tpush += time;
      if (irc != 0) {
         printf("cgppushf2l error: irc=%d\n",irc);
         exit(1);
      }

/* reorder particles by tile with OpenMP: */
      dtimer(&dtime,&itime,-1);
/* updates ppart, ppbuff, kpic, ncl, ihole, and irc */
/*    cpporder2l(ppart,ppbuff,kpic,ncl,ihole,idimp,nppmx0,nx,ny,mx,my, */
/*               mx1,my1,npbmx,ntmax,&irc);                            */
/* updates ppart, ppbuff, kpic, ncl, and irc */
      cpporderf2l(ppart,ppbuff,kpic,ncl,ihole,idimp,nppmx0,mx1,my1,
                  npbmx,ntmax,&irc);
      dtimer(&dtime,&itime,1);
      time = (float) dtime;
      tsort += time;
      if (irc != 0) {
         printf("cpporderf2l error: ntmax, irc=%d,%d\n",ntmax,irc);
         exit(1);
      }

      if (ntime==0) {
         printf("Initial Field, Kinetic and Total Energies:\n");
         printf("%e %e %e\n",we,wke,wke+we);
      }
   }

/* * * * end main iteration loop * * * */

   printf("ntime = %i\n",ntime);
   printf("Final Field, Kinetic and Total Energies:\n");
   printf("%e %e %e\n",we,wke,wke+we);

   printf("\n");
   printf("deposit time = %f\n",tdpost);
   printf("guard time = %f\n",tguard);
   printf("solver time = %f\n",tfield);
   printf("fft time = %f\n",tfft);
   printf("push time = %f\n",tpush);
   printf("sort time = %f\n",tsort);
/* from here on tfield holds solver + guard + fft time */
   tfield += tguard + tfft;
   printf("total solver time = %f\n",tfield);
   time = tdpost + tpush + tsort;
   printf("total particle time = %f\n",time);
   wt = time + tfield;
   printf("total time = %f\n",wt);
   printf("\n");

/* wt is reused as the scale from seconds to nsec per particle per step */
   wt = 1.0e+09/(((float) nloop)*((float) np));
   printf("Push Time (nsec) = %f\n",tpush*wt);
   printf("Deposit Time (nsec) = %f\n",tdpost*wt);
   printf("Sort Time (nsec) = %f\n",tsort*wt);
   printf("Total Particle Time (nsec) = %f\n",time*wt);
   printf("\n");

/* release host buffers */
   free(ihole);
   free(ncl);
   free(ppbuff);
   free(ppart);
   free(kpic);
   free(sct);
   free(mixup);
   free(ffc);
   free(fxye);
   free(qe);
   free(part);

   return 0;
}
Exemple #2
0
/*
 * Driver that runs the 2D particle loop with the deposit, guard-cell and
 * push steps executed through the *_cuda routines, while the same steps are
 * repeated on the host so the two results can be compared when VALIDATE is
 * nonzero.
 *
 * Usage: prog [indx indy npx npy tend dt]
 *   With no arguments the built-in defaults are used.  With six arguments
 *   the four integers must be positive and tend/dt must be nonzero.
 *
 * Returns 0 on success.  Calls exit(1) on bad arguments, on a failed host
 * allocation, or when a validation comparison fails.
 */
int main(int argc, char *argv[]) 
{
    int indx = 6, indy = 7, npx = 48, npy = 12;
    float tend = 65.0f, dt = 0.1f;
    
    // parse in parameters
    if(argc == 7)
    {
        indx = atoi(argv[1]);
        indy = atoi(argv[2]);
        npx = atoi(argv[3]);
        npy = atoi(argv[4]);
        tend = atof(argv[5]);
        dt = atof(argv[6]);
        
        /* indx/indy are used as shift counts (1L<<indx) and npx/npy as  */
        /* particle counts, so negative values must be rejected as well  */
        /* as the zero that atoi returns for non-numeric input.          */
        if(indx <= 0 || indy <= 0 || npx <= 0 || npy <= 0 || !(tend && dt))
        {
            printf("One or more parameters are invalid.\n");
            exit(1);
        }
    }
    else if(argc != 1)
    {
        printf("Usage: %s indx indy npx npy tend dt\n", argv[0]);
        exit(1);
    }
    else
    {
        printf("Using default parameters...\n");
    }
    
    int ndim = 2; 
    float qme = -1.0;
    float vtx = 1.0, vty = 1.0, vx0 = 0.0, vy0 = 0.0;
    float ax = .912871, ay = .912871;
    /* idimp = dimension of phase space = 4 */
    /* sortime = number of time steps between standard electron sorting */
    int idimp = 4, ipbc = 1, sortime = 50;
    float wke = 0.0, we = 0.0, wt = 0.0;
    /* declare scalars for standard code */
    int j;
    int np, nx, ny, nxh, nyh, nxe, nye, nxeh, nxyh, nxhy;
    int ny1, ntime, nloop, isign;
    float qbme, affp;
    /* declare arrays for standard code */
    float *part = NULL, *part2 = NULL, *tpart = NULL;
    float *qe = NULL;
    float *fxye = NULL;
    float complex *ffc = NULL;
    int *mixup = NULL;
    float complex *sct = NULL;
    int *npicy = NULL;
    /* declare and initialize timing data */
    /* NOTE(review): time, itime and dtime are presumably referenced by the */
    /* TS / TE macros defined elsewhere - do not rename without checking.   */
    float time;
    struct timeval itime;
    float tdpost = 0.0, tguard = 0.0, tfft = 0.0, tfield = 0.0;
    float tpush = 0.0, tsort = 0.0;
    double dtime;
    
    /* initialize scalars for standard code */
    np = npx*npy; nx = 1L<<indx; ny = 1L<<indy; nxh = nx/2; nyh = ny/2;
    nxe = nx + 2; nye = ny + 1; nxeh = nxe/2;
    nxyh = (nx > ny ? nx : ny)/2; nxhy = nxh > ny ? nxh : ny;
    ny1 = ny + 1;
    /* the small offset keeps float round-off from dropping the last step */
    nloop = tend/dt + .0001; ntime = 0;
    qbme = qme;
    affp = (float) (nx*ny)/(float ) np;
    /* allocate data for standard code; sizes are computed in size_t so the */
    /* element counts are not multiplied in int before being scaled         */
    part = (float *) malloc((size_t) idimp*np*sizeof(float));
    part2 = (float *) malloc((size_t) idimp*np*sizeof(float));
    qe = (float *) malloc((size_t) nxe*nye*sizeof(float));
    fxye = (float *) malloc((size_t) ndim*nxe*nye*sizeof(float));
    ffc = (float complex *) malloc((size_t) nxh*nyh*sizeof(float complex));
    mixup = (int *) malloc((size_t) nxhy*sizeof(int));
    sct = (float complex *) malloc((size_t) nxyh*sizeof(float complex));
    npicy = (int *) malloc((size_t) ny1*sizeof(int));
    /* stop here rather than hand a NULL buffer to the library routines */
    if(!part || !part2 || !qe || !fxye || !ffc || !mixup || !sct || !npicy)
    {
        printf("memory allocation error\n");
        exit(1);
    }
    /* prepare fft tables */
    cwfft2rinit(mixup,sct,indx,indy,nxhy,nxyh);
    /* calculate form factors */
    isign = 0;
    cpois22((float complex *)qe,(float complex *)fxye,isign,ffc,ax,ay,affp,
            &we,nx,ny,nxeh,nye,nxh,nyh);
    /* initialize electrons */
    cdistr2(part,vtx,vty,vx0,vy0,npx,npy,idimp,np,nx,ny,ipbc);
    
    
/* --------------------------------------------------------------------------*/
/* ---------------------------- set up ------------------------------------- */
/* --------------------------------------------------------------------------*/

    /* buffer sizes in bytes, as passed to the copy helpers */
    int sz_qe = nxe * nye * sizeof(float);
    int sz_part = idimp * np * sizeof(float);
    int sz_fxye = ndim*nxe*nye*sizeof(float);
    
    float* g_part = (float*)copyToGPU(part, sz_part);
    float* g_qe = (float*)copyToGPU(qe, sz_qe);
    float* g_fxye = (float*)copyToGPU(fxye, sz_fxye);
    float* g_wke = (float*)copyToGPU(&wke, sizeof(float));
    int* mutexes = createMutexes(nxe * nye);
    
/* --------------------------------------------------------------------------*/
    
    /* round-trip check: data read back must match what was sent */
    if(VALIDATE)
    {
        float* t = (float*)copyFromGPU(g_part, sz_part);
        float* t2 = (float*)copyFromGPU(g_qe, sz_qe);
        if(floatArrayCompare(t, part, sz_part / sizeof(float), "copy", "orig", 0) != 0 ||
           floatArrayCompare(t2, qe, sz_qe / sizeof(float), "copy", "orig", 0) !=0)
        {
            printf("Copying to and from GPU failed validation.\n");
            exit(1);
        }
        free(t);
        free(t2);
    }
    
    /* * * * start main iteration loop * * * */
    
    for(ntime = 0; ntime < nloop; ntime++)
    {
        /*    printf("ntime = %i\n",ntime); */
        
        /* deposit charge: the CUDA call is timed and updates g_qe */
        TS;
        cgpost2l_cuda(g_part, g_qe, qme, np, idimp, nxe, nye, npx, npy, mutexes);
        TE(tdpost);
        
        /* untimed host deposit used as the reference: updates qe */
        for (j = 0; j < nxe*nye; j++)
            qe[j] = 0.0;
        cgpost2l(part,qe,qme,np,idimp,nxe,nye);
        
        if(VALIDATE)
        {
            float* t = (float*)copyFromGPU(g_qe, sz_qe);
            if(floatArrayCompare(t, qe, sz_qe / sizeof(float), "gpu", "cpu", 1e-4) != 0)
            {
                printf("cgpost2l failed validation, ntime=%d\n", ntime);
                exit(1);
            }
            free(t);
        }
        
        /* add guard cells: the CUDA call is timed and updates g_qe */
        TS;
        caguard2l_cuda(g_qe,nx,ny,nxe,nye);
        TE(tguard);
        
        /* untimed host reference: updates qe */
        caguard2l(qe,nx,ny,nxe,nye);
        
        if(VALIDATE)
        {
            float* t = (float*)copyFromGPU(g_qe, sz_qe);
            if(floatArrayCompare(t, qe, sz_qe / sizeof(float), "gpu", "cpu", 1e-4) != 0)
            {
                printf("caguard2l failed validation, ntime=%d\n", ntime);
                exit(1);
            }
            free(t);
        }
        
        /* transform charge to fourier space with standard procedure:     */
        /* updates qe.  The copy helpers presumably take (dst, src, size), */
        /* so qe is loaded from g_qe before the FFT and stored back after. */
        TS;
        copyFromGPU2(qe, g_qe, sz_qe);
        isign = -1;
        cwfft2rx((float complex *)qe,isign,mixup,sct,indx,indy,nxeh,nye,
                 nxhy,nxyh);
        copyToGPU2(g_qe, qe, sz_qe);
        TE(tfft);
        
        /* calculate force/charge in fourier space with standard procedure: */
        /* updates fxye                                                     */
        TS;
        isign = -1;
        cpois22((float complex *)qe,(float complex *)fxye,isign,ffc,ax,ay,
                affp,&we,nx,ny,nxeh,nye,nxh,nyh);
        TE(tfield);
        
        /* transform force to real space with standard procedure: updates fxye */
        TS;
        isign = 1;
        cwfft2r2((float complex *)fxye,isign,mixup,sct,indx,indy,nxeh,nye,
                 nxhy,nxyh);
        TE(tfft);
        
        /* copy guard cells with standard procedure: updates fxye */
        TS;
        ccguard2l(fxye,nx,ny,nxe,nye);
        TE(tguard);
        
        /* push particles: the CUDA call is timed and works on g_part */
        TS;
        copyToGPU2(g_fxye, fxye, sz_fxye);
        cgpush2l_cuda(g_part,g_fxye,qbme,dt,g_wke,idimp,np,nx,ny,nxe,nye,ipbc,npx,npy,mutexes);
        TE(tpush);
        
        /* untimed host reference: updates part, wke.  The wke that is */
        /* printed below comes from this host call; g_wke is never     */
        /* read back in this function.                                 */
        wke = 0.0;
        cgpush2l(part,fxye,qbme,dt,&wke,idimp,np,nx,ny,nxe,nye,ipbc);
        
        if(VALIDATE)
        {
            float* t = (float*)copyFromGPU(g_part, sz_part);
            if(floatArrayCompare(t, part, sz_part / sizeof(float), "gpu", "cpu", 1e-4) != 0)
            {
                printf("cgpush2l failed validation, ntime=%d\n", ntime);
                exit(1);
            }
            free(t);
        }
        
        /* sort particles by cell for standard code every sortime steps, */
        /* then refresh the GPU copy from the sorted host array          */
        if (sortime > 0 && ntime%sortime==0) {
            TS;
            cdsortp2yl(part,part2,npicy,idimp,np,ny1);
            /* exchange pointers */
            tpart = part;
            part = part2;
            part2 = tpart;
            copyToGPU2(g_part, part, sz_part);
            TE(tsort);
        }
        
        if (ntime==0) {
            printf("Initial Field, Kinetic and Total Energies:\n");
            printf("%e %e %e\n",we,wke,wke+we);
        }
    }
    
    /* * * * end main iteration loop * * * */
    
    printf("ntime = %i\n",ntime);
    printf("Final Field, Kinetic and Total Energies:\n");
    printf("%e %e %e\n",we,wke,wke+we);
    printf("\n");
    printf("deposit time = %f\n",tdpost);
    printf("guard time = %f\n",tguard);
    printf("solver time = %f\n",tfield);
    printf("fft time = %f\n",tfft);
    printf("push time = %f\n",tpush);
    printf("sort time = %f\n",tsort);
    /* from here on tfield holds solver + guard + fft time */
    tfield += tguard + tfft;
    printf("total solver time = %f\n",tfield);
    time = tdpost + tpush + tsort;
    printf("total particle time = %f\n",time);
    wt = time + tfield;
    printf("total time = %f\n",wt);
    printf("\n");
    /* wt is reused as the scale from seconds to nsec per particle per step */
    wt = 1.0e+09/(((float) nloop)*((float) np));
    printf("Push Time (nsec) = %f\n",tpush*wt);
    printf("Deposit Time (nsec) = %f\n",tdpost*wt);
    printf("Sort Time (nsec) = %f\n",tsort*wt);
    printf("Total Particle Time (nsec) = %f\n",time*wt);
    
    /* release every buffer obtained from copyToGPU */
    freeOnGPU(g_part);
    freeOnGPU(g_qe);
    freeOnGPU(g_fxye);
    freeOnGPU(g_wke);
    /* NOTE(review): mutexes comes from createMutexes(); no matching */
    /* release routine is visible here, so it is left as-is.         */
    
    /* release host buffers (part and part2 may have been swapped; both */
    /* still point at their own malloc'd storage)                       */
    free(npicy);
    free(sct);
    free(mixup);
    free(ffc);
    free(fxye);
    free(qe);
    free(part2);
    free(part);
    return 0;
}