コード例 #1
0
ファイル: gfft.c プロジェクト: adamdempsey90/Snoopy
/*
 * Create the FFTW plans used by the FFT routines and reset the FFT timer.
 *
 * Four MPI 3-D plans are built on the wr1 (real) / w1 (complex) buffers:
 * real-to-complex and complex-to-real, each in a plain variant (NX, NY, NZ)
 * and in a transposed variant that passes (NY, NX, NZ) together with
 * FFTW_MPI_TRANSPOSED_OUT / FFTW_MPI_TRANSPOSED_IN.  One serial 2-D
 * real-to-complex plan is built on wrh3 / wh3.  Any plan that comes back
 * NULL is reported through ERROR_HANDLER with ERROR_CRITICAL.
 *
 * Finally init_transpose() is called and fft_timer is set to zero.
 */
void init_gfft() {
#ifdef _OPENMP
	/* Thread count for the plans created below (FFTW applies it to plans
	 * created after this call). */
	fftw_plan_with_nthreads( nthreads );
#endif

	/* Forward transform, transposed output. */
	r2cfft_mpi_t = fftw_mpi_plan_dft_r2c_3d( NY, NX, NZ, wr1, w1,  MPI_COMM_WORLD, FFT_PLANNING | FFTW_MPI_TRANSPOSED_OUT);
	if (r2cfft_mpi_t == NULL) ERROR_HANDLER( ERROR_CRITICAL, "FFTW R2C_T plan creation failed");

	/* Forward transform, natural ordering. */
	r2cfft_mpi = fftw_mpi_plan_dft_r2c_3d( NX, NY, NZ, wr1, w1,  MPI_COMM_WORLD, FFT_PLANNING);
	if (r2cfft_mpi == NULL) ERROR_HANDLER( ERROR_CRITICAL, "FFTW R2C plan creation failed");

	/* Inverse transform, transposed input. */
	c2rfft_mpi_t = fftw_mpi_plan_dft_c2r_3d( NY, NX, NZ, w1, wr1,  MPI_COMM_WORLD, FFT_PLANNING | FFTW_MPI_TRANSPOSED_IN);
	if (c2rfft_mpi_t == NULL) ERROR_HANDLER( ERROR_CRITICAL, "FFTW C2R_T plan creation failed");

	/* Inverse transform, natural ordering. */
	c2rfft_mpi = fftw_mpi_plan_dft_c2r_3d( NX, NY, NZ, w1, wr1,  MPI_COMM_WORLD, FFT_PLANNING);
	if (c2rfft_mpi == NULL) ERROR_HANDLER( ERROR_CRITICAL, "FFTW C2R plan creation failed");

	/* Serial (non-MPI) 2-D forward transform of a single slice. */
	r2cfft_2Dslice = fftw_plan_dft_r2c_2d(NX,NY,wrh3,wh3,FFT_PLANNING);
	if (r2cfft_2Dslice == NULL) ERROR_HANDLER( ERROR_CRITICAL, "FFTW r2c slice plan creation failed");

	// init transpose routines (These are used by remap routines)
	init_transpose();

	fft_timer=0.0;
}
コード例 #2
0
ファイル: ex146.c プロジェクト: tom-klotz/petsc
/*
 * Round-trip check of FFTW's MPI 3-D real-to-complex / complex-to-real
 * transforms, driven through PETSc vectors that wrap FFTW-allocated buffers.
 *
 * A constant field (1.57) is transformed forward and then backward.  The
 * entries of the input and of the result that correspond to real grid points
 * (the FFTW-layout vectors are indexed with row stride NM, the plain vectors
 * with row stride N2) are copied into two ordinary PETSc vectors, the result
 * is scaled by 1/(N0*N1*N2), and the 1-norm of the difference is printed.
 *
 * Returns the PETSc error code of the first failing call, otherwise the
 * result of PetscFinalize().
 */
PetscInt main(PetscInt argc,char **args)
{
  ptrdiff_t      N0=256,N1=256,N2=256;
  fftw_plan      bplan,fplan;
  fftw_complex   *out;
  double         *in1,*in2;
  ptrdiff_t      alloc_local,local_n0,local_0_start;
  ptrdiff_t      local_n1,local_1_start;
  PetscInt       i,j,k;
  PetscInt       n,N,N_factor,NM,nlocal;
  int            rank;                 /* MPI_Comm_rank() writes an int */
  PetscScalar    one=1.57,zero=0.0;
  PetscScalar    a,*x_arr,*y_arr,*z_arr,*vals;
  PetscReal      enorm;                /* VecNorm() writes a PetscReal */
  Vec            fin,fout,fout1,ini,final;
  PetscErrorCode ierr;
  PetscInt       *indx3,*indx4;
  PetscInt       low,tempindx,tempindx1;


  ierr = PetscInitialize(&argc,&args,(char*)0,help);if (ierr) return ierr;
#if defined(PETSC_USE_COMPLEX)
  SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_SUP, "This example requires real numbers. Your current scalar type is complex");
#endif
  ierr = MPI_Comm_rank(PETSC_COMM_WORLD, &rank);CHKERRQ(ierr);

  /* Local slab extent and the number of complex elements FFTW needs on this rank. */
  alloc_local = fftw_mpi_local_size_3d_transposed(N0,N1,N2/2+1,PETSC_COMM_WORLD,&local_n0,&local_0_start,&local_n1,&local_1_start);

  /* ptrdiff_t is printed with %td. */
  printf("The value local_n0 is %td from process %d\n",local_n0,rank);

  /* Allocate space for input and output arrays  */

  in1=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
  in2=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
  out=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);


  N=2*N0*N1*(N2/2+1);N_factor=N0*N1*N2;
  n=2*local_n0*N1*(N2/2+1);
  nlocal=local_n0*N1*N2;               /* real grid points owned by this rank */

  /* Creating data vector and accompanying array with VeccreateMPIWithArray */
  ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,1,n,N,(PetscScalar*)in1,&fin);CHKERRQ(ierr);
  ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,1,n,N,(PetscScalar*)out,&fout);CHKERRQ(ierr);
  ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,1,n,N,(PetscScalar*)in2,&fout1);CHKERRQ(ierr);

  ierr = VecSet(fin,one);CHKERRQ(ierr);
  ierr = VecSet(fout,zero);CHKERRQ(ierr);
  ierr = VecSet(fout1,zero);CHKERRQ(ierr);

  ierr = VecAssemblyBegin(fin);CHKERRQ(ierr);
  ierr = VecAssemblyEnd(fin);CHKERRQ(ierr);


  ierr = VecGetArray(fin,&x_arr);CHKERRQ(ierr);
  ierr = VecGetArray(fout1,&z_arr);CHKERRQ(ierr);
  ierr = VecGetArray(fout,&y_arr);CHKERRQ(ierr);

  fplan=fftw_mpi_plan_dft_r2c_3d(N0,N1,N2,(double*)x_arr,(fftw_complex*)y_arr,PETSC_COMM_WORLD,FFTW_ESTIMATE);
  bplan=fftw_mpi_plan_dft_c2r_3d(N0,N1,N2,(fftw_complex*)y_arr,(double*)z_arr,PETSC_COMM_WORLD,FFTW_ESTIMATE);

  fftw_execute(fplan);
  fftw_execute(bplan);

  /* VecRestoreArray() resets the pointers, so x_arr/y_arr/z_arr must not be
     used as storage after this point. */
  ierr = VecRestoreArray(fin,&x_arr);CHKERRQ(ierr);
  ierr = VecRestoreArray(fout1,&z_arr);CHKERRQ(ierr);
  ierr = VecRestoreArray(fout,&y_arr);CHKERRQ(ierr);


  ierr = VecCreate(PETSC_COMM_WORLD,&ini);CHKERRQ(ierr);
  ierr = VecCreate(PETSC_COMM_WORLD,&final);CHKERRQ(ierr);
  ierr = VecSetSizes(ini,nlocal,N_factor);CHKERRQ(ierr);
  ierr = VecSetSizes(final,nlocal,N_factor);CHKERRQ(ierr);
  ierr = VecSetFromOptions(ini);CHKERRQ(ierr);
  ierr = VecSetFromOptions(final);CHKERRQ(ierr);

  /* Row stride used when indexing the FFTW-layout vectors (fin, fout1). */
  if (N2%2==0) NM=N2+2;
  else NM=N2+1;

  ierr = VecGetOwnershipRange(fin,&low,NULL);CHKERRQ(ierr);
  printf("The local index is %ld from %d\n",(long)low,rank);
  ierr = PetscMalloc1(nlocal,&indx3);CHKERRQ(ierr);
  ierr = PetscMalloc1(nlocal,&indx4);CHKERRQ(ierr);
  /* Scratch buffer that VecGetValues() fills and VecSetValues() reads. */
  ierr = PetscMalloc1(nlocal,&vals);CHKERRQ(ierr);
  for (i=0; i<local_n0; i++) {
    for (j=0;j<N1;j++) {
      for (k=0;k<N2;k++) {
        tempindx  = i*N1*N2 + j*N2 + k;   /* offset in the plain layout  */
        tempindx1 = i*N1*NM + j*NM + k;   /* offset in the FFTW layout   */

        indx3[tempindx]=local_0_start*N1*N2+tempindx;  /* global index into ini/final  */
        indx4[tempindx]=low+tempindx1;                 /* global index into fin/fout1  */
      }
    }
  }
  ierr = VecGetValues(fin,nlocal,indx4,vals);CHKERRQ(ierr);
  ierr = VecSetValues(ini,nlocal,indx3,vals,INSERT_VALUES);CHKERRQ(ierr);
  ierr = VecAssemblyBegin(ini);CHKERRQ(ierr);
  ierr = VecAssemblyEnd(ini);CHKERRQ(ierr);

  ierr = VecGetValues(fout1,nlocal,indx4,vals);CHKERRQ(ierr);
  ierr = VecSetValues(final,nlocal,indx3,vals,INSERT_VALUES);CHKERRQ(ierr);
  ierr = VecAssemblyBegin(final);CHKERRQ(ierr);
  ierr = VecAssemblyEnd(final);CHKERRQ(ierr);

  ierr = PetscFree(vals);CHKERRQ(ierr);
  ierr = PetscFree(indx3);CHKERRQ(ierr);
  ierr = PetscFree(indx4);CHKERRQ(ierr);

  printf("The local index value is %td from %d",local_n0*N1*N2,rank);

  /* Scale the backward-transform result by 1/(N0*N1*N2) before comparing. */
  a    = 1.0/(PetscReal)N_factor;
  ierr = VecScale(fout1,a);CHKERRQ(ierr);
  ierr = VecScale(final,a);CHKERRQ(ierr);

  ierr = VecAssemblyBegin(ini);CHKERRQ(ierr);
  ierr = VecAssemblyEnd(ini);CHKERRQ(ierr);

  ierr = VecAssemblyBegin(final);CHKERRQ(ierr);
  ierr = VecAssemblyEnd(final);CHKERRQ(ierr);

  ierr = VecAXPY(final,-1.0,ini);CHKERRQ(ierr);
  ierr = VecNorm(final,NORM_1,&enorm);CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_WORLD,"  Error norm of |x - z|  = %e\n",(double)enorm);CHKERRQ(ierr);

  /* Release plans, then the vectors, then the FFTW buffers the vectors wrapped. */
  fftw_destroy_plan(fplan);
  fftw_destroy_plan(bplan);
  ierr = VecDestroy(&ini);CHKERRQ(ierr);
  ierr = VecDestroy(&final);CHKERRQ(ierr);
  ierr = VecDestroy(&fin);CHKERRQ(ierr);
  ierr = VecDestroy(&fout);CHKERRQ(ierr);
  ierr = VecDestroy(&fout1);CHKERRQ(ierr);
  fftw_free(in1);
  fftw_free(out);
  fftw_free(in2);

  ierr = PetscFinalize();
  return ierr;
}
コード例 #3
0
ファイル: auxPM.c プロジェクト: CullanHowlett/PICOLA_DEV
// Top-level driver (called from main.c) for one acceleration calculation:
// move particles, build the density mesh, compute forces, interpolate back.
// =========================================================================
void GetDisplacements(void) {

  int axis;
#ifdef TIMING
  double cpu_begin, cpu_end;
  double wall_begin, wall_end;
#endif

  // Step 1: hand particles over to the task that should own them after the
  // previous time step / the original 2LPT displacement.
  if (ThisTask == 0) {
    printf("Moving particles across task boundaries...\n");
  }
#ifdef TIMING
  cpu_begin  = (double)clock();
  wall_begin = MPI_Wtime();
#endif
  MoveParticles();
#ifdef TIMING
  cpu_end  = (double)clock();
  wall_end = MPI_Wtime();
  CpuTime_Move[timeSteptot-1]  = (cpu_end-cpu_begin)/(double)CLOCKS_PER_SEC;
  WallTime_Move[timeSteptot-1] = wall_end-wall_begin;
#endif

#ifdef MEMORY_MODE
  // Density grid and its forward plan exist only for the duration of this call.
  density = (float_kind *)malloc(2*Total_size*sizeof(float_kind));
  P3D     = (complex_kind*)density;
#ifdef SINGLE_PRECISION
  plan = fftwf_mpi_plan_dft_r2c_3d(Nmesh,Nmesh,Nmesh,density,P3D,MPI_COMM_WORLD,FFTW_ESTIMATE);
#else
  plan = fftw_mpi_plan_dft_r2c_3d(Nmesh,Nmesh,Nmesh,density,P3D,MPI_COMM_WORLD,FFTW_ESTIMATE);
#endif
#endif

  // Step 2: Cloud-in-Cell assignment onto the density grid, followed by its FFT.
  if (ThisTask == 0) {
    printf("Calculating density using Cloud-in-Cell...\n");
  }
#ifdef TIMING
  cpu_begin  = (double)clock();
  wall_begin = MPI_Wtime();
#endif
  PtoMesh();
#ifdef TIMING
  cpu_end  = (double)clock();
  wall_end = MPI_Wtime();
  CpuTime_PtoMesh[timeSteptot-1]  = (cpu_end-cpu_begin)/(double)CLOCKS_PER_SEC;
  WallTime_PtoMesh[timeSteptot-1] = wall_end-wall_begin;
#endif

#ifdef MEMORY_MODE
  // Force grids (each aliased by a complex view) and their inverse plans.
  N11  = (float_kind *)malloc(2*Total_size*sizeof(float_kind));
  FN11 = (complex_kind*)N11;
  N12  = (float_kind *)malloc(2*Total_size*sizeof(float_kind));
  FN12 = (complex_kind*)N12;
  N13  = (float_kind *)malloc(2*Total_size*sizeof(float_kind));
  FN13 = (complex_kind*)N13;
#ifdef SINGLE_PRECISION
  p11  = fftwf_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN11,N11,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p12  = fftwf_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN12,N12,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p13  = fftwf_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN13,N13,MPI_COMM_WORLD,FFTW_ESTIMATE);
#else
  p11  = fftw_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN11,N11,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p12  = fftw_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN12,N12,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p13  = fftw_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN13,N13,MPI_COMM_WORLD,FFTW_ESTIMATE);
#endif
#endif

  // Step 3: fill N11, N12, N13 with the components of the vector
  // (grad grad^{-2} density) on the grid.
  if (ThisTask == 0) {
    printf("Calculating forces...\n");
  }
#ifdef TIMING
  cpu_begin  = (double)clock();
  wall_begin = MPI_Wtime();
#endif
  Forces();
#ifdef TIMING
  cpu_end  = (double)clock();
  wall_end = MPI_Wtime();
  CpuTime_Forces[timeSteptot-1]  = (cpu_end-cpu_begin)/(double)CLOCKS_PER_SEC;
  WallTime_Forces[timeSteptot-1] = wall_end-wall_begin;
#endif

  // Per-particle displacement arrays, one per axis.  They are not released
  // in this function.
#ifdef MEMORY_MODE
  // The density grid and its plan are no longer needed once Forces() is done.
#ifdef SINGLE_PRECISION
  fftwf_destroy_plan(plan);
#else
  fftw_destroy_plan(plan);
#endif
  free(density);
  for (axis=0; axis!=3; ++axis) {
    Disp[axis] = (float *)malloc(NumPart*sizeof(float));
  }
#else
  for (axis=0; axis!=3; ++axis) {
    Disp[axis] = (float_kind *)malloc(NumPart*sizeof(float_kind));
  }
#endif

  // Step 4: 3-linear interpolation of the grid accelerations to the particle positions.
  if (ThisTask == 0) {
    printf("Calculating accelerations...\n");
  }
#ifdef TIMING
  cpu_begin  = (double)clock();
  wall_begin = MPI_Wtime();
#endif
  MtoParticles();
#ifdef TIMING
  cpu_end  = (double)clock();
  wall_end = MPI_Wtime();
  CpuTime_MtoParticles[timeSteptot-1]  = (cpu_end-cpu_begin)/(double)CLOCKS_PER_SEC;
  WallTime_MtoParticles[timeSteptot-1] = wall_end-wall_begin;
#endif

#ifdef MEMORY_MODE
  // Tear down the inverse plans and the force grids created above.
#ifdef SINGLE_PRECISION
  fftwf_destroy_plan(p11);
  fftwf_destroy_plan(p12);
  fftwf_destroy_plan(p13);
#else
  fftw_destroy_plan(p11);
  fftw_destroy_plan(p12);
  fftw_destroy_plan(p13);
#endif
  free(N11);
  free(N12);
  free(N13);
#endif
}
コード例 #4
0
ファイル: main.c プロジェクト: CullanHowlett/PICOLA_DEV
/*
 * Program entry point.
 *
 * Sequence, as visible in this function:
 *   1. Initialise MPI and the FFTW MPI layer; require a parameter-file path
 *      in argv[1].
 *   2. Read the parameters, derive the stepping options from UseCOLA, print
 *      a run summary on task 0.
 *   3. Initialise transfer function / power spectrum / FFTs / particles,
 *      build the displacement fields and fill the particle array P.
 *   4. Run the time-stepping loop for timeStep = 0..nsteps.  Every iteration
 *      computes the displacement field and kicks the velocities; all but the
 *      last also drift the positions.  The last iteration jumps to the
 *      finalize label.
 *   5. Convert velocities, write output, release memory and plans, shut down
 *      FFTW MPI and MPI.
 *
 * With MEMORY_MODE defined the mesh arrays and FFTW plans are created and
 * destroyed inside every loop iteration; otherwise they are created once
 * before the loop and released after the finalize label.
 *
 * Returns 0.  The return values stored in ierr are not inspected here.
 */
int main(int argc, char **argv) {
   
  // Set up MPI
  // ==========
  ierr = MPI_Init(&argc, &argv);
  ierr = MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
  ierr = MPI_Comm_size(MPI_COMM_WORLD, &NTask);
#ifdef SINGLE_PRECISION
  fftwf_mpi_init();
#else
  fftw_mpi_init();
#endif

  // No parameter file given: print usage on task 0 and stop.
  // NOTE(review): the process exits with status 0 on this path.
  if(argc < 2) {
    if(ThisTask == 0) {
      fprintf(stdout, "Input parameters not found\n");
      fprintf(stdout, "Call with <ParameterFile>\n");
    }
    ierr = MPI_Finalize();
    exit(0);
  }
   
  // Read the run parameters and setup code
  // ======================================
  int stepDistr;   
  int subtractLPT;
  double da=0;

  read_parameterfile(argv[1]);
  
  // stepDistr and subtractLPT are only ever assigned 0 or 1 in this function;
  // subtractLPT is later used as a 0/1 multiplier on the LPT terms.
  if (UseCOLA == 1){
    subtractLPT = 1; 
    stepDistr   = 0;
    StdDA       = 0;
  } else{
    subtractLPT = 0; 
    stepDistr   = 1;
    StdDA       = 2;
  }
  if (StdDA == 0){
    fullT = 1;
    nLPT  = -2.5;
  }
  filter = 0;              // Whether or not to smooth the forces
  Scale  = 2.*M_PI/Box;    // The force smoothing scale 

  // Run summary, printed by task 0 only.
  if(ThisTask == 0) {
    printf("Run Parameters\n");
    printf("==============\n");
    printf("Cosmology:\n");
    printf("  Omega Matter(z=0) = %lf\n",Omega);
    printf("  Omega Baryon(z=0) = %lf\n",OmegaBaryon);
    printf("  Hubble Parameter(z=0) = %lf\n",HubbleParam);
    printf("  Sigma8(z=0) = %lf\n",Sigma8);
#ifndef GAUSSIAN
    printf("  F_nl = %lf\n",Fnl);
#endif
    printf("  Primordial Index = %lf\n",PrimordialIndex);
    printf("  Initial Redshift  = %lf\n",Init_Redshift);
    printf("  Final Redshift    = %lf\n",Final_Redshift);
#ifndef GAUSSIAN
    printf("  F_nl Redshift  = %lf\n",Fnl_Redshift);
#endif
    printf("Simulation:\n");
    printf("  Nmesh = %d\n", Nmesh);
    printf("  Nsample = %d\n", Nsample);
    printf("  Boxsize = %lf\n", Box);
    printf("  Buffer Size = %lf\n", Buffer);
    switch(WhichSpectrum) {
      case 0:
        switch (WhichTransfer) {
          case 1:
            printf("  Using Eisenstein & Hu Transfer Function\n");
            break;
          case 2:
            printf("  Using Tabulated Transfer Function\n");
            break;
          default:
            printf("  Using Efstathiou Transfer Function\n");
            break;
        }
        break;
      case 1:
        printf("  Using Eisenstein & Hu Power Spectrum\n");
        break;
      case 2:
        printf("  Using Tabulated Power Spectrum\n");
        break;   
      default:
        printf("  Using Efstathiou Power Spectrum\n");
        break;
    }      
    printf("  Number of Timesteps = %d\n",nsteps);
    if (UseCOLA) {
      printf("  Using COLA method\n\n");
    } else {
      printf("  Using Standard PM method\n\n");
    }
    fflush(stdout);
  }   
  
  // Initial and final scale factors:
  double ai=1.0/(1.0+Init_Redshift);
  double af=1.0/(1.0+Final_Redshift);
    
  // Step size da: linear in a (0), linear in log a (1) or linear in CosmoTime (2).
  if (stepDistr == 0) da=(af-ai)/((double)nsteps);
  if (stepDistr == 1) da=(log(af)-log(ai))/((double)nsteps);
  if (stepDistr == 2) da=(CosmoTime(af)-CosmoTime(ai))/((double)nsteps);

  set_units();

  if (ThisTask == 0) {
    printf("Initialising Transfer Function/Power Spectrum\n");
    printf("=============================================\n");
  }
  initialize_transferfunction();
  initialize_powerspectrum();
  initialize_ffts();
  initialize_parts();

  if(ThisTask == 0) {
    printf("Creating initial conditions\n");
    printf("===========================\n");
    fflush(stdout);
  }

  // Calculate the Zeldovich and 2LPT displacements and create the initial conditions
  // ================================================================================
  int i, j, k, m;
  unsigned int n, coord;
  double A=ai;                // This is the scale factor which we'll be advancing below.
  double Di=growthD(1.0, A);  // initial growth factor
  double Di2=growthD2(A);     // initial 2nd order growth factor  
  double Dv=DprimeQ(A,1.0);   // T[D_{za}]=dD_{za}/dy
  double Dv2=growthD2v(A);    // T[D_{2lpt}]=dD_{2lpt}/dy

  displacement_fields();
    
  // Particle array sized for ceil(NumPart*Buffer) entries.
  // NOTE(review): the malloc result is not checked before P is written below.
  P = (struct part_data *) malloc((int)(ceil(NumPart*Buffer))*sizeof(struct part_data));

  // Generate the initial particle positions and velocities
  // If subtractLPT = 0 (non-COLA), then velocity is ds/dy, which is simply the 2LPT IC.
  // Else set vel = 0 if we subtract LPT. This is the same as the action of the operator L_- from TZE, as initial velocities are in 2LPT.
  for(i=0; i<Local_np; i++) {
    for (j=0; j<Nsample; j++) {
      for (k=0; k<Nsample; k++) {
        // coord is the local (per-task) index; ID uses the global i offset.
        coord = (i * Nsample + j) * Nsample + k;
           
        P[coord].ID = ((i + Local_p_start) * Nsample + j) * Nsample + k;
        for (m=0; m<3; m++) {
          P[coord].Dz[m] = ZA[m][coord];
          P[coord].D2[m] = LPT[m][coord];
          if (subtractLPT == 0) {
            P[coord].Vel[m]=P[coord].Dz[m]*Dv+P[coord].D2[m]*Dv2;
          } else {
            P[coord].Vel[m] = 0.0;
          }
        }

        // Grid position plus the Zeldovich and 2LPT displacements, wrapped periodically.
        P[coord].Pos[0] = periodic_wrap((i+Local_p_start)*(Box/Nsample)+P[coord].Dz[0]*Di+P[coord].D2[0]*Di2);
        P[coord].Pos[1] = periodic_wrap(j*(Box/Nsample)+P[coord].Dz[1]*Di+P[coord].D2[1]*Di2);
        P[coord].Pos[2] = periodic_wrap(k*(Box/Nsample)+P[coord].Dz[2]*Di+P[coord].D2[2]*Di2);
      }
    }
  }

  // The displacement fields have been copied into P and are released here.
  for (i=0; i<3; i++) {
    free(ZA[i]);
    free(LPT[i]);
  }

  // Now, we get to the N-Body part where we evolve with time via the Kick-Drift-Kick Method
  // =======================================================================================
  int timeStep;
  double AF=0,AI,AC,AFF=0;
  double growth1   = Di;
  double growth1L2 = Di2;

  // The density grid and force grids  and associated fftw plans
  // (allocated once here unless MEMORY_MODE is defined; released after finalize)
#ifndef MEMORY_MODE
  density = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
  N11  = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
  N12  = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
  N13  = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
  P3D  = (complex_kind*)density;
  FN11 = (complex_kind*)N11;
  FN12 = (complex_kind*)N12;
  FN13 = (complex_kind*)N13;
#ifdef SINGLE_PRECISION
  plan = fftwf_mpi_plan_dft_r2c_3d(Nmesh,Nmesh,Nmesh,density,P3D,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p11  = fftwf_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN11,N11,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p12  = fftwf_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN12,N12,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p13  = fftwf_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN13,N13,MPI_COMM_WORLD,FFTW_ESTIMATE);
#else
  plan = fftw_mpi_plan_dft_r2c_3d(Nmesh,Nmesh,Nmesh,density,P3D,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p11  = fftw_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN11,N11,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p12  = fftw_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN12,N12,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p13  = fftw_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN13,N13,MPI_COMM_WORLD,FFTW_ESTIMATE);
#endif
#endif

 if(ThisTask == 0) {
    printf("Beginning timestepping\n");
    printf("======================\n");
    fflush(stdout);
  }
  
  // AI stores the scale factor to which the velocities have been kicked to. Initially it's just A.
  AI=A;
  // nsteps+1 iterations: the last one (timeStep == nsteps) performs only the
  // final kick and then leaves the loop through "goto finalize".
  for (timeStep=0;timeStep<=nsteps;timeStep++){
    
    // AFF is the scale factor to which we should drift the particle positions.
    // AF is the scale factor to which we should kick the particle velocities.
    if (stepDistr == 0) AFF=A+da;
    if (stepDistr == 1) AFF=A*exp(da);
    if (stepDistr == 2) AFF=AofTime(CosmoTime(A)+da);

    // half time-step for final kick
    if (timeStep == nsteps) {
      AF=A; 
    } else { 
      // Set to mid-point of interval. In the infinitesimal timestep limit, these choices are identical. 
      // How one chooses the mid-point when not in that limit is really an extra degree of freedom in the code 
      // but Tassev et al. report negligible effects from the different choices below. 
      // Hence, this is not exported as an extra switch at this point.
      if (stepDistr == 0) AF=A+da*0.5;
      if (stepDistr == 1) AF=A*exp(da*0.5);
      if (stepDistr == 2) AF=AofTime((CosmoTime(AFF)+CosmoTime(A))*0.5); 
    }
    
    if (ThisTask == 0) {
      printf("Iteration = %d\n------------------\n",timeStep+1);
      printf("a = %lf\n",A);
      printf("z = %lf\n",1.0/A-1.0);
      fflush(stdout);
    }

    // First we check whether all the particles are on the correct processor after the last time step/
    // original 2LPT displacement and move them if not
    if (ThisTask == 0) printf("Moving particles across task boundaries...\n");
    MoveParticles();

#ifdef MEMORY_MODE
    // Per-iteration density grid (zero-initialised) and its forward plan.
    density = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
    P3D  = (complex_kind*)density;
#ifdef SINGLE_PRECISION
    plan = fftwf_mpi_plan_dft_r2c_3d(Nmesh,Nmesh,Nmesh,density,P3D,MPI_COMM_WORLD,FFTW_ESTIMATE);
#else
    plan = fftw_mpi_plan_dft_r2c_3d(Nmesh,Nmesh,Nmesh,density,P3D,MPI_COMM_WORLD,FFTW_ESTIMATE);
#endif
#endif

    // Then we do the Cloud-in-Cell assignment to get the density grid and FFT it.  
    if (ThisTask == 0) printf("Calculating density using Cloud-in-Cell...\n");
    PtoMesh();

#ifdef MEMORY_MODE
    // Per-iteration force grids and their inverse plans.
    N11  = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
    N12  = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
    N13  = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
    FN11 = (complex_kind*)N11;
    FN12 = (complex_kind*)N12;
    FN13 = (complex_kind*)N13;
#ifdef SINGLE_PRECISION
    p11  = fftwf_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN11,N11,MPI_COMM_WORLD,FFTW_ESTIMATE);
    p12  = fftwf_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN12,N12,MPI_COMM_WORLD,FFTW_ESTIMATE);
    p13  = fftwf_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN13,N13,MPI_COMM_WORLD,FFTW_ESTIMATE);
#else
    p11  = fftw_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN11,N11,MPI_COMM_WORLD,FFTW_ESTIMATE);
    p12  = fftw_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN12,N12,MPI_COMM_WORLD,FFTW_ESTIMATE);
    p13  = fftw_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN13,N13,MPI_COMM_WORLD,FFTW_ESTIMATE);
#endif
#endif
    
    // This returns N11,N12,N13 which hold the components of
    // the vector (grad grad^{-2} density) on a grid.
    if (ThisTask == 0) printf("Calculating forces...\n");
    Forces(); 

    // Allocate the per-particle displacement arrays (freed after the kick below).
    // In MEMORY_MODE the density grid and its plan are released first.
#ifdef MEMORY_MODE
    free(density);
    for (i=0; i<3; i++) Disp[i] = (float *)malloc(NumPart*sizeof(float));
#ifdef SINGLE_PRECISION
    fftwf_destroy_plan(plan);
#else 
    fftw_destroy_plan(plan);
#endif
#else
    for (i=0; i<3; i++) Disp[i] = (float_kind *)malloc(NumPart*sizeof(float_kind));
#endif
    
    // Now find the accelerations at the particle positions using 3-linear interpolation. 
    if (ThisTask == 0) printf("Calculating accelerations...\n");
    MtoParticles();

    // In MEMORY_MODE the force grids and their plans are released every iteration.
#ifdef MEMORY_MODE
  free(N11);
  free(N12);
  free(N13);  
#ifdef SINGLE_PRECISION
  fftwf_destroy_plan(p11);
  fftwf_destroy_plan(p12);
  fftwf_destroy_plan(p13);
#else
  fftw_destroy_plan(p11);
  fftw_destroy_plan(p12);
  fftw_destroy_plan(p13);
#endif
#endif
    
    // Calculate the mean displacement and subtract later.
    if (ThisTask == 0) printf("Calculating mean of displacements...\n");
    double sumDx=0,sumDy=0,sumDz=0;
    for(n=0; n<NumPart; n++) {
      sumDx += Disp[0][n];
      sumDy += Disp[1][n];
      sumDz += Disp[2][n];
    }

    // Make sumDx, sumDy and sumDz global averages
    ierr = MPI_Allreduce(MPI_IN_PLACE,&sumDx,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
    ierr = MPI_Allreduce(MPI_IN_PLACE,&sumDy,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
    ierr = MPI_Allreduce(MPI_IN_PLACE,&sumDz,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);  
    
    sumDx /= (double)TotNumPart; // We will subtract these below to conserve momentum. 
    sumDy /= (double)TotNumPart;
    sumDz /= (double)TotNumPart; 

    if (ThisTask == 0) {
      printf("Kicking the particles...\n");
      fflush(stdout);
    }

    // Kick
    // ===============
    double dda;
    double q1,q2;
    double ax,ay,az;
    double sumx=0,sumy=0,sumz=0; 
    double Om143=pow(Omega/(Omega+(1-Omega)*A*A*A),1./143.);
    
    // Kick interval dda; which formula is used depends on StdDA.
    if (StdDA == 0) {
      dda=Sphi(AI,AF,A);
    } else if (StdDA == 1) {
      dda=(AF-AI)*A/Qfactor(A);
    } else {
      dda=SphiStd(AI,AF);
    }  
    
    q2=1.5*Omega*growth1*growth1*(1.0+7./3.*Om143)*A; // T^2[D_{2lpt}]=d^2 D_{2lpt}/dy^2
    q1=1.5*Omega*growth1*A;                           // T^2[D_{ZA}]=d^2 D_{ZA}/dy^2
    
    for(n=0; n<NumPart; n++) {

      // Remove the global mean displacement computed above.
      Disp[0][n] -= sumDx;
      Disp[1][n] -= sumDy;
      Disp[2][n] -= sumDz;

      // The LPT term only contributes when subtractLPT is 1.
      ax=-1.5*Omega*Disp[0][n]-subtractLPT*(P[n].Dz[0]*q1+P[n].D2[0]*q2)/A;
      ay=-1.5*Omega*Disp[1][n]-subtractLPT*(P[n].Dz[1]*q1+P[n].D2[1]*q2)/A;
      az=-1.5*Omega*Disp[2][n]-subtractLPT*(P[n].Dz[2]*q1+P[n].D2[2]*q2)/A;

      P[n].Vel[0] += ax*dda;
      P[n].Vel[1] += ay*dda;
      P[n].Vel[2] += az*dda;

      sumx += P[n].Vel[0];
      sumy += P[n].Vel[1];
      sumz += P[n].Vel[2];
    }

    // Displacement arrays are only needed for the kick.
    for (i=0; i<3; i++) free(Disp[i]);

    // Make sumx, sumy and sumz global averages
    ierr = MPI_Allreduce(MPI_IN_PLACE,&sumx,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
    ierr = MPI_Allreduce(MPI_IN_PLACE,&sumy,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
    ierr = MPI_Allreduce(MPI_IN_PLACE,&sumz,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);  
    
    sumx /= (double)TotNumPart;  // We will subtract these below to conserve momentum. 
    sumy /= (double)TotNumPart;  // Should be conserved, but just in case 3-linear interpolation makes a problem.
    sumz /= (double)TotNumPart;  // Never checked whether this makes a difference.

    if (timeStep == nsteps) {

      if (ThisTask == 0) {
        printf("Iteration %d finished\n------------------\n\n", timeStep+1);
        printf("Timestepping finished\n\n");
        fflush(stdout);
      }
    
      // At final timestep, add back LPT velocities if we had subtracted them. 
      // This corresponds to L_+ operator in TZE.
      Dv  = DprimeQ(A,1.0);  // dD_{za}/dy
      Dv2 = growthD2v(A);    // dD_{2lpt}/dy

      for(n=0; n<NumPart; n++) {
        P[n].Vel[0] += -sumx+(P[n].Dz[0]*Dv+P[n].D2[0]*Dv2)*subtractLPT;
        P[n].Vel[1] += -sumy+(P[n].Dz[1]*Dv+P[n].D2[1]*Dv2)*subtractLPT;
        P[n].Vel[2] += -sumz+(P[n].Dz[2]*Dv+P[n].D2[2]*Dv2)*subtractLPT;
      }

      // Skips the drift, the time advance and the barrier of this iteration.
      goto finalize; // Sorry for "goto" :)
    }
    
    if (ThisTask == 0) {
      printf("Drifting the particles...\n");
      fflush(stdout);
    }

    // Drift
    // =============
    double dyyy;
    double da1,da2;

    // AC keeps the scale factor the velocities were just kicked to; AF becomes the drift target.
    AC = AF;
    AF = AFF;
    
    // Drift interval dyyy; which formula is used depends on StdDA.
    if (StdDA == 0) {
      dyyy=Sq(A,AF,AC);
    } else if (StdDA == 1) {
      dyyy=(AF-A)/Qfactor(AC);
    } else {
      dyyy=SqStd(A,AF);
    }

    da1=growthD(1.0, AF)-growth1;    // change in D
    da2=growthD2(AF)-growth1L2; // change in D_{2lpt}
    
    for(n=0; n<NumPart; n++) {
        // Move with the mean-subtracted velocity ...
        P[n].Pos[0] += (P[n].Vel[0]-sumx)*dyyy;
        P[n].Pos[1] += (P[n].Vel[1]-sumy)*dyyy;
        P[n].Pos[2] += (P[n].Vel[2]-sumz)*dyyy;

        // ... then add the LPT displacement change (when subtractLPT is 1) and wrap.
        P[n].Pos[0] = periodic_wrap(P[n].Pos[0]+subtractLPT*(P[n].Dz[0]*da1+P[n].D2[0]*da2));
        P[n].Pos[1] = periodic_wrap(P[n].Pos[1]+subtractLPT*(P[n].Dz[1]*da1+P[n].D2[1]*da2));
        P[n].Pos[2] = periodic_wrap(P[n].Pos[2]+subtractLPT*(P[n].Dz[2]*da1+P[n].D2[2]*da2));
    }

    // Step in time
    // ================
    A  = AF;   // WRT to the above name change, A  = AFF
    AI = AC;   // WRT to the above name change, AI = AF

    growth1   = growthD(1.0, A);
    growth1L2 = growthD2(A);

    if (ThisTask == 0) {
      printf("Iteration %d finished\n------------------\n\n", timeStep+1);
      fflush(stdout);
    }
     
    ierr = MPI_Barrier(MPI_COMM_WORLD);

  }

  // Here is the last little bit
  // ===========================
  finalize:

  if (ThisTask == 0) {
    printf("Finishing up\n");
    printf("============\n");
    fflush(stdout);
  }
    
  // Now convert velocities to v_{rsd}\equiv (ds/d\eta)/(a H(a))
  velRSD(A);
    
  // Output a slice just for the sake of doing something with P.
  if (ThisTask == 0) {
    printf("Converting to RSD velocities...\n");
    printf("Outputting particles...\n"); 
  }
  slice();
  print_spec();
  fflush(stdout);

  // Release tables, the particle array and the task-layout arrays.
  free_powertable();
  free_transfertable();
#ifdef GENERIC_FNL
  free(KernelTable);
#endif

  free(P);
  free(Slab_to_task);
  free(Part_to_task);
  free(Local_nx_table);
  free(Local_np_table);
  // Grids and plans that were created once before the loop (non-MEMORY_MODE only).
#ifndef MEMORY_MODE
  free(density);
  free(N11);
  free(N12);
  free(N13);  
#ifdef SINGLE_PRECISION
  fftwf_destroy_plan(plan);
  fftwf_destroy_plan(p11);
  fftwf_destroy_plan(p12);
  fftwf_destroy_plan(p13);
#else
  fftw_destroy_plan(plan);
  fftw_destroy_plan(p11);
  fftw_destroy_plan(p12);
  fftw_destroy_plan(p13);
#endif
#endif

#ifdef SINGLE_PRECISION
  fftwf_mpi_cleanup();
#else
  fftw_mpi_cleanup();
#endif

  if (ThisTask == 0) printf("Done :)\n");

  MPI_Finalize();   

  return 0;
}
コード例 #5
0
ファイル: pm_periodic.c プロジェクト: huilin2014/cuda-gadget
/*! Calculates the long-range potential using the PM method.  The potential is
 *  Gaussian filtered with Asmth, given in mesh-cell units. We carry out a CIC
 *  charge assignment, and compute the potential by Fourier transform
 *  methods. The CIC kernel is deconvolved.
 *
 *  Outline of the body:
 *   1. determine the mesh-cell bounding box of the local particles and
 *      exchange it with all tasks;
 *   2. bin the particle masses onto the local patch held in "workspace";
 *   3. add the patches into the slab-decomposed "rhogrid";
 *   4. forward FFT, multiply by the Green's function, inverse FFT;
 *   5. send the potential back to the local patches (with a margin of two
 *      cells on each side) and interpolate it to the particle positions,
 *      accumulating into P[i].Potential.
 *
 *  The function takes no arguments and returns nothing; it works on the
 *  file/global state (P, NumPart, All, rhogrid, workspace, forcegrid, ...).
 *  It calls force_treefree() on entry and force_treeallocate() on exit.
 */
void pmpotential_periodic(void)
{
  double k2, kx, ky, kz, smth;
  double dx, dy, dz;
  double fx, fy, fz, ff;
  double asmth2, fac;
  int i, j, slab, level, sendTask, recvTask;
  int x, y, z, ip;
  int slab_x, slab_y, slab_z;
  int slab_xx, slab_yy, slab_zz;
  int meshmin[3], meshmax[3], sendmin, sendmax, recvmin, recvmax;
  int rep, ncont, cont_sendmin[2], cont_sendmax[2], cont_recvmin[2], cont_recvmax[2];
  int dimx, dimy, dimz, recv_dimx, recv_dimy, recv_dimz;
  MPI_Status status;

  if(ThisTask == 0)
    {
      printf("Starting periodic PM calculation.\n");
      fflush(stdout);
    }

  /* squared smoothing scale, expressed in units of the integer wave number */
  asmth2 = (2 * M_PI) * All.Asmth[0] / All.BoxSize;
  asmth2 *= asmth2;

  fac = All.G / (M_PI * All.BoxSize);	/* to get potential */

  force_treefree();

  /* first, establish the extension of the local patch in the PMGRID  */

  for(j = 0; j < 3; j++)
    {
      meshmin[j] = PMGRID;
      meshmax[j] = 0;
    }

  for(i = 0; i < NumPart; i++)
    {
      for(j = 0; j < 3; j++)
	{
	  slab = to_slab_fac * P[i].Pos[j];
	  if(slab >= PMGRID)
	    slab = PMGRID - 1;

	  if(slab < meshmin[j])
	    meshmin[j] = slab;

	  if(slab > meshmax[j])
	    meshmax[j] = slab;
	}
    }

  /* every task learns the patch extent of every other task */
  MPI_Allgather(meshmin, 3, MPI_INT, meshmin_list, 3, MPI_INT, MPI_COMM_WORLD);
  MPI_Allgather(meshmax, 3, MPI_INT, meshmax_list, 3, MPI_INT, MPI_COMM_WORLD);

  /* +2: the CIC assignment below also writes to the cell at index + 1 */
  dimx = meshmax[0] - meshmin[0] + 2;
  dimy = meshmax[1] - meshmin[1] + 2;
  dimz = meshmax[2] - meshmin[2] + 2;

  pm_init_periodic_allocate((dimx + 4) * (dimy + 4) * (dimz + 4));

#ifdef FFTW3
  /* Create the in-place r2c/c2r plans only once and share them with
   * pmforce_periodic() through fftw_plan_exists.  Nothing in this function
   * destroys a plan, so planning on every call leaked the previous pair and
   * silently replaced the plans used by pmforce_periodic(). */
  if(!fftw_plan_exists)
    {
      fft_forward_plan = fftw_mpi_plan_dft_r2c_3d(PMGRID, PMGRID, PMGRID, rhogrid, fft_of_rhogrid,
						  MPI_COMM_WORLD, FFTW_ESTIMATE | FFTW_MPI_TRANSPOSED_OUT);
      fft_inverse_plan = fftw_mpi_plan_dft_c2r_3d(PMGRID, PMGRID, PMGRID, fft_of_rhogrid, rhogrid,
						  MPI_COMM_WORLD, FFTW_ESTIMATE | FFTW_MPI_TRANSPOSED_IN);
      fftw_plan_exists = true;
    }
#endif

  for(i = 0; i < dimx * dimy * dimz; i++)
    workspace[i] = 0;

  /* CIC mass assignment onto the local patch */
  for(i = 0; i < NumPart; i++)
    {
      slab_x = to_slab_fac * P[i].Pos[0];
      if(slab_x >= PMGRID)
	slab_x = PMGRID - 1;
      dx = to_slab_fac * P[i].Pos[0] - slab_x;
      slab_x -= meshmin[0];
      slab_xx = slab_x + 1;

      slab_y = to_slab_fac * P[i].Pos[1];
      if(slab_y >= PMGRID)
	slab_y = PMGRID - 1;
      dy = to_slab_fac * P[i].Pos[1] - slab_y;
      slab_y -= meshmin[1];
      slab_yy = slab_y + 1;

      slab_z = to_slab_fac * P[i].Pos[2];
      if(slab_z >= PMGRID)
	slab_z = PMGRID - 1;
      dz = to_slab_fac * P[i].Pos[2] - slab_z;
      slab_z -= meshmin[2];
      slab_zz = slab_z + 1;

      workspace[(slab_x * dimy + slab_y) * dimz + slab_z] += P[i].Mass * (1.0 - dx) * (1.0 - dy) * (1.0 - dz);
      workspace[(slab_x * dimy + slab_yy) * dimz + slab_z] += P[i].Mass * (1.0 - dx) * dy * (1.0 - dz);
      workspace[(slab_x * dimy + slab_y) * dimz + slab_zz] += P[i].Mass * (1.0 - dx) * (1.0 - dy) * dz;
      workspace[(slab_x * dimy + slab_yy) * dimz + slab_zz] += P[i].Mass * (1.0 - dx) * dy * dz;

      workspace[(slab_xx * dimy + slab_y) * dimz + slab_z] += P[i].Mass * (dx) * (1.0 - dy) * (1.0 - dz);
      workspace[(slab_xx * dimy + slab_yy) * dimz + slab_z] += P[i].Mass * (dx) * dy * (1.0 - dz);
      workspace[(slab_xx * dimy + slab_y) * dimz + slab_zz] += P[i].Mass * (dx) * (1.0 - dy) * dz;
      workspace[(slab_xx * dimy + slab_yy) * dimz + slab_zz] += P[i].Mass * (dx) * dy * dz;
    }


  for(i = 0; i < fftsize; i++)	/* clear local density field */
    rhogrid[i] = 0;

  /* pairwise exchange: in round "level" this task talks to ThisTask ^ level */
  for(level = 0; level < (1 << PTask); level++)	/* note: for level=0, target is the same task */
    {
      sendTask = ThisTask;
      recvTask = ThisTask ^ level;
      if(recvTask < NTask)
	{
	  /* check how much we have to send */
	  sendmin = 2 * PMGRID;
	  sendmax = -1;
	  for(slab_x = meshmin[0]; slab_x < meshmax[0] + 2; slab_x++)
	    if(slab_to_task[slab_x % PMGRID] == recvTask)
	      {
		if(slab_x < sendmin)
		  sendmin = slab_x;
		if(slab_x > sendmax)
		  sendmax = slab_x;
	      }
	  if(sendmax == -1)
	    sendmin = 0;

	  /* check how much we have to receive */
	  recvmin = 2 * PMGRID;
	  recvmax = -1;
	  for(slab_x = meshmin_list[3 * recvTask]; slab_x < meshmax_list[3 * recvTask] + 2; slab_x++)
	    if(slab_to_task[slab_x % PMGRID] == sendTask)
	      {
		if(slab_x < recvmin)
		  recvmin = slab_x;
		if(slab_x > recvmax)
		  recvmax = slab_x;
	      }
	  if(recvmax == -1)
	    recvmin = 0;


	  if((recvmax - recvmin) >= 0 || (sendmax - sendmin) >= 0)	/* ok, we have a contribution to the slab */
	    {
	      recv_dimx = meshmax_list[3 * recvTask + 0] - meshmin_list[3 * recvTask + 0] + 2;
	      recv_dimy = meshmax_list[3 * recvTask + 1] - meshmin_list[3 * recvTask + 1] + 2;
	      recv_dimz = meshmax_list[3 * recvTask + 2] - meshmin_list[3 * recvTask + 2] + 2;

	      if(level > 0)
		{
		  MPI_Sendrecv(workspace + (sendmin - meshmin[0]) * dimy * dimz,
			       (sendmax - sendmin + 1) * dimy * dimz * sizeof(fftw_real), MPI_BYTE, recvTask,
			       TAG_PERIODIC_C, forcegrid,
			       (recvmax - recvmin + 1) * recv_dimy * recv_dimz * sizeof(fftw_real), MPI_BYTE,
			       recvTask, TAG_PERIODIC_C, MPI_COMM_WORLD, &status);
		}
	      else
		{
		  memcpy(forcegrid, workspace + (sendmin - meshmin[0]) * dimy * dimz,
			 (sendmax - sendmin + 1) * dimy * dimz * sizeof(fftw_real));
		}

	      /* accumulate the received patch into the local slabs, wrapping periodically */
	      for(slab_x = recvmin; slab_x <= recvmax; slab_x++)
		{
		  slab_xx = (slab_x % PMGRID) - first_slab_of_task[ThisTask];

		  if(slab_xx >= 0 && slab_xx < slabs_per_task[ThisTask])
		    {
		      for(slab_y = meshmin_list[3 * recvTask + 1];
			  slab_y <= meshmax_list[3 * recvTask + 1] + 1; slab_y++)
			{
			  slab_yy = slab_y;
			  if(slab_yy >= PMGRID)
			    slab_yy -= PMGRID;

			  for(slab_z = meshmin_list[3 * recvTask + 2];
			      slab_z <= meshmax_list[3 * recvTask + 2] + 1; slab_z++)
			    {
			      slab_zz = slab_z;
			      if(slab_zz >= PMGRID)
				slab_zz -= PMGRID;

			      rhogrid[PMGRID * PMGRID2 * slab_xx + PMGRID2 * slab_yy + slab_zz] +=
				forcegrid[((slab_x - recvmin) * recv_dimy +
					   (slab_y - meshmin_list[3 * recvTask + 1])) * recv_dimz +
					  (slab_z - meshmin_list[3 * recvTask + 2])];
			    }
			}
		    }
		}
	    }
	}
    }



  /* Do the FFT of the density field */
#ifdef FFTW3
  /* The shared plan may have been created for an earlier allocation of the
   * grids (they are presumably re-allocated by pm_init_periodic_allocate()
   * on every call), so pass the current arrays explicitly.  Plans made by
   * fftw_mpi_plan_* must use the fftw_mpi_execute_* new-array functions. */
  fftw_mpi_execute_dft_r2c(fft_forward_plan, rhogrid, fft_of_rhogrid);
#else
  rfftwnd_mpi(fft_forward_plan, 1, rhogrid, workspace, FFTW_TRANSPOSED_ORDER);
#endif

  /* multiply with Green's function for the potential */

  for(y = slabstart_y; y < slabstart_y + nslab_y; y++)
    for(x = 0; x < PMGRID; x++)
      for(z = 0; z < PMGRID / 2 + 1; z++)
	{
	  /* map grid indices to signed integer wave numbers */
	  if(x > PMGRID / 2)
	    kx = x - PMGRID;
	  else
	    kx = x;
	  if(y > PMGRID / 2)
	    ky = y - PMGRID;
	  else
	    ky = y;
	  if(z > PMGRID / 2)
	    kz = z - PMGRID;
	  else
	    kz = z;

	  k2 = kx * kx + ky * ky + kz * kz;

	  if(k2 > 0)
	    {
	      smth = -exp(-k2 * asmth2) / k2 * fac;
	      /* do deconvolution */
	      fx = fy = fz = 1;
	      if(kx != 0)
		{
		  fx = (M_PI * kx) / PMGRID;
		  fx = sin(fx) / fx;
		}
	      if(ky != 0)
		{
		  fy = (M_PI * ky) / PMGRID;
		  fy = sin(fy) / fy;
		}
	      if(kz != 0)
		{
		  fz = (M_PI * kz) / PMGRID;
		  fz = sin(fz) / fz;
		}
	      ff = 1 / (fx * fy * fz);
	      smth *= ff * ff * ff * ff;
	      /* end deconvolution */

	      /* transposed layout: y is the slowest-varying index here */
	      ip = PMGRID * (PMGRID / 2 + 1) * (y - slabstart_y) + (PMGRID / 2 + 1) * x + z;
	      c_re(fft_of_rhogrid[ip]) *= smth;
	      c_im(fft_of_rhogrid[ip]) *= smth;
	    }
	}

  /* remove the k = 0 mode on the task that holds it */
  if(slabstart_y == 0)
    c_re(fft_of_rhogrid[0]) = c_im(fft_of_rhogrid[0]) = 0.0;

  /* Do the FFT to get the potential */
#ifdef FFTW3
  fftw_mpi_execute_dft_c2r(fft_inverse_plan, fft_of_rhogrid, rhogrid);

  /* No division by PMGRID^3 here.  FFTW3 and the FFTW2 call in the #else
   * branch both compute unnormalised transforms, and pmforce_periodic()
   * applies no such factor either.  Dividing only in this branch made the
   * FFTW3 potential PMGRID^3 times smaller than the FFTW2 one (and the int
   * product PMGRID*PMGRID*PMGRID can overflow for large meshes). */
#else
  rfftwnd_mpi(fft_inverse_plan, 1, rhogrid, workspace, FFTW_TRANSPOSED_ORDER);
#endif

  /* note: "rhogrid" now contains the potential */



  /* patch dimensions including a margin of 2 cells on both sides */
  dimx = meshmax[0] - meshmin[0] + 6;
  dimy = meshmax[1] - meshmin[1] + 6;
  dimz = meshmax[2] - meshmin[2] + 6;

  for(level = 0; level < (1 << PTask); level++)	/* note: for level=0, target is the same task */
    {
      sendTask = ThisTask;
      recvTask = ThisTask ^ level;

      if(recvTask < NTask)
	{

	  /* check how much we have to send */
	  sendmin = 2 * PMGRID;
	  sendmax = -PMGRID;
	  for(slab_x = meshmin_list[3 * recvTask] - 2; slab_x < meshmax_list[3 * recvTask] + 4; slab_x++)
	    if(slab_to_task[(slab_x + PMGRID) % PMGRID] == sendTask)
	      {
		if(slab_x < sendmin)
		  sendmin = slab_x;
		if(slab_x > sendmax)
		  sendmax = slab_x;
	      }
	  if(sendmax == -PMGRID)
	    sendmin = sendmax + 1;


	  /* check how much we have to receive */
	  recvmin = 2 * PMGRID;
	  recvmax = -PMGRID;
	  for(slab_x = meshmin[0] - 2; slab_x < meshmax[0] + 4; slab_x++)
	    if(slab_to_task[(slab_x + PMGRID) % PMGRID] == recvTask)
	      {
		if(slab_x < recvmin)
		  recvmin = slab_x;
		if(slab_x > recvmax)
		  recvmax = slab_x;
	      }
	  if(recvmax == -PMGRID)
	    recvmin = recvmax + 1;

	  if((recvmax - recvmin) >= 0 || (sendmax - sendmin) >= 0)	/* ok, we have a contribution to the slab */
	    {
	      recv_dimx = meshmax_list[3 * recvTask + 0] - meshmin_list[3 * recvTask + 0] + 6;
	      recv_dimy = meshmax_list[3 * recvTask + 1] - meshmin_list[3 * recvTask + 1] + 6;
	      recv_dimz = meshmax_list[3 * recvTask + 2] - meshmin_list[3 * recvTask + 2] + 6;

	      /* because of the periodic wrap the slab range may consist of two
	         separate pieces; second piece starts out empty */
	      ncont = 1;
	      cont_sendmin[0] = sendmin;
	      cont_sendmax[0] = sendmax;
	      cont_sendmin[1] = sendmax + 1;
	      cont_sendmax[1] = sendmax;

	      cont_recvmin[0] = recvmin;
	      cont_recvmax[0] = recvmax;
	      cont_recvmin[1] = recvmax + 1;
	      cont_recvmax[1] = recvmax;

	      for(slab_x = sendmin; slab_x <= sendmax; slab_x++)
		{
		  if(slab_to_task[(slab_x + PMGRID) % PMGRID] != ThisTask)
		    {
		      /* non-contiguous */
		      cont_sendmax[0] = slab_x - 1;
		      while(slab_to_task[(slab_x + PMGRID) % PMGRID] != ThisTask)
			slab_x++;
		      cont_sendmin[1] = slab_x;
		      ncont++;
		    }
		}

	      for(slab_x = recvmin; slab_x <= recvmax; slab_x++)
		{
		  if(slab_to_task[(slab_x + PMGRID) % PMGRID] != recvTask)
		    {
		      /* non-contiguous */
		      cont_recvmax[0] = slab_x - 1;
		      while(slab_to_task[(slab_x + PMGRID) % PMGRID] != recvTask)
			slab_x++;
		      cont_recvmin[1] = slab_x;
		      if(ncont == 1)
			ncont++;
		    }
		}


	      for(rep = 0; rep < ncont; rep++)
		{
		  sendmin = cont_sendmin[rep];
		  sendmax = cont_sendmax[rep];
		  recvmin = cont_recvmin[rep];
		  recvmax = cont_recvmax[rep];

		  /* prepare what we want to send */
		  if(sendmax - sendmin >= 0)
		    {
		      for(slab_x = sendmin; slab_x <= sendmax; slab_x++)
			{
			  slab_xx = ((slab_x + PMGRID) % PMGRID) - first_slab_of_task[ThisTask];

			  for(slab_y = meshmin_list[3 * recvTask + 1] - 2;
			      slab_y < meshmax_list[3 * recvTask + 1] + 4; slab_y++)
			    {
			      slab_yy = (slab_y + PMGRID) % PMGRID;

			      for(slab_z = meshmin_list[3 * recvTask + 2] - 2;
				  slab_z < meshmax_list[3 * recvTask + 2] + 4; slab_z++)
				{
				  slab_zz = (slab_z + PMGRID) % PMGRID;

				  forcegrid[((slab_x - sendmin) * recv_dimy +
					     (slab_y - (meshmin_list[3 * recvTask + 1] - 2))) * recv_dimz +
					    slab_z - (meshmin_list[3 * recvTask + 2] - 2)] =
				    rhogrid[PMGRID * PMGRID2 * slab_xx + PMGRID2 * slab_yy + slab_zz];
				}
			    }
			}
		    }

		  if(level > 0)
		    {
		      MPI_Sendrecv(forcegrid,
				   (sendmax - sendmin + 1) * recv_dimy * recv_dimz * sizeof(fftw_real),
				   MPI_BYTE, recvTask, TAG_PERIODIC_D,
				   workspace + (recvmin - (meshmin[0] - 2)) * dimy * dimz,
				   (recvmax - recvmin + 1) * dimy * dimz * sizeof(fftw_real), MPI_BYTE,
				   recvTask, TAG_PERIODIC_D, MPI_COMM_WORLD, &status);
		    }
		  else
		    {
		      memcpy(workspace + (recvmin - (meshmin[0] - 2)) * dimy * dimz,
			     forcegrid, (recvmax - recvmin + 1) * dimy * dimz * sizeof(fftw_real));
		    }
		}
	    }
	}
    }


  /* dim*: patch without margin (layout of forcegrid);
     recv_dim*: patch with margin (layout of workspace) */
  dimx = meshmax[0] - meshmin[0] + 2;
  dimy = meshmax[1] - meshmin[1] + 2;
  dimz = meshmax[2] - meshmin[2] + 2;

  recv_dimx = meshmax[0] - meshmin[0] + 6;
  recv_dimy = meshmax[1] - meshmin[1] + 6;
  recv_dimz = meshmax[2] - meshmin[2] + 6;



  /* strip the 2-cell margin: copy the inner part of workspace into forcegrid */
  for(x = 0; x < meshmax[0] - meshmin[0] + 2; x++)
    for(y = 0; y < meshmax[1] - meshmin[1] + 2; y++)
      for(z = 0; z < meshmax[2] - meshmin[2] + 2; z++)
	{
	  forcegrid[(x * dimy + y) * dimz + z] =
	    workspace[((x + 2) * recv_dimy + (y + 2)) * recv_dimz + (z + 2)];
	}


  /* read out the potential */

  for(i = 0; i < NumPart; i++)
    {
      slab_x = to_slab_fac * P[i].Pos[0];
      if(slab_x >= PMGRID)
	slab_x = PMGRID - 1;
      dx = to_slab_fac * P[i].Pos[0] - slab_x;
      slab_x -= meshmin[0];
      slab_xx = slab_x + 1;

      slab_y = to_slab_fac * P[i].Pos[1];
      if(slab_y >= PMGRID)
	slab_y = PMGRID - 1;
      dy = to_slab_fac * P[i].Pos[1] - slab_y;
      slab_y -= meshmin[1];
      slab_yy = slab_y + 1;

      slab_z = to_slab_fac * P[i].Pos[2];
      if(slab_z >= PMGRID)
	slab_z = PMGRID - 1;
      dz = to_slab_fac * P[i].Pos[2] - slab_z;
      slab_z -= meshmin[2];
      slab_zz = slab_z + 1;

      /* tri-linear interpolation with the same weights as the assignment */
      P[i].Potential +=
	forcegrid[(slab_x * dimy + slab_y) * dimz + slab_z] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz);
      P[i].Potential += forcegrid[(slab_x * dimy + slab_yy) * dimz + slab_z] * (1.0 - dx) * dy * (1.0 - dz);
      P[i].Potential += forcegrid[(slab_x * dimy + slab_y) * dimz + slab_zz] * (1.0 - dx) * (1.0 - dy) * dz;
      P[i].Potential += forcegrid[(slab_x * dimy + slab_yy) * dimz + slab_zz] * (1.0 - dx) * dy * dz;

      P[i].Potential += forcegrid[(slab_xx * dimy + slab_y) * dimz + slab_z] * (dx) * (1.0 - dy) * (1.0 - dz);
      P[i].Potential += forcegrid[(slab_xx * dimy + slab_yy) * dimz + slab_z] * (dx) * dy * (1.0 - dz);
      P[i].Potential += forcegrid[(slab_xx * dimy + slab_y) * dimz + slab_zz] * (dx) * (1.0 - dy) * dz;
      P[i].Potential += forcegrid[(slab_xx * dimy + slab_yy) * dimz + slab_zz] * (dx) * dy * dz;
    }

  pm_init_periodic_free();
  force_treeallocate(All.TreeAllocFactor * All.MaxPart, All.MaxPart);

  All.NumForcesSinceLastDomainDecomp = 1 + All.TotNumPart * All.TreeDomainUpdateFrequency;

  if(ThisTask == 0)
    {
      printf("done PM-Potential.\n");
      fflush(stdout);
    }
}
コード例 #6
0
ファイル: pm_periodic.c プロジェクト: huilin2014/cuda-gadget
/*! Calculates the long-range periodic force given the particle positions
 *  using the PM method.  The force is Gaussian filtered with Asmth, given in
 *  mesh-cell units. We carry out a CIC charge assignment, and compute the
 *  potential by Fourier transform methods. The potential is finite differenced
 *  using a 4-point finite differencing formula, and the forces are
 *  interpolated tri-linearly to the particle positions. The CIC kernel is
 *  deconvolved. Note that the particle distribution is not in the slab
 *  decomposition that is used for the FFT. Instead, overlapping patches
 *  between local domains and FFT slabs are communicated as needed.
 *
 *  The function takes no arguments and returns nothing; it works on the
 *  file/global state (P, NumPart, All, rhogrid, workspace, forcegrid, ...)
 *  and stores the result in P[i].GravPM[0..2].  It calls force_treefree()
 *  on entry and force_treeallocate() on exit.
 */
void pmforce_periodic(void)
{
  double k2, kx, ky, kz, smth;
  double dx, dy, dz;
  double fx, fy, fz, ff;
  double asmth2, fac, acc_dim;
  int i, j, slab, level, sendTask, recvTask;
  int x, y, z, xl, yl, zl, xr, yr, zr, xll, yll, zll, xrr, yrr, zrr, ip, dim;
  int slab_x, slab_y, slab_z;
  int slab_xx, slab_yy, slab_zz;
  int meshmin[3], meshmax[3], sendmin, sendmax, recvmin, recvmax;
  int rep, ncont, cont_sendmin[2], cont_sendmax[2], cont_recvmin[2], cont_recvmax[2];
  int dimx, dimy, dimz, recv_dimx, recv_dimy, recv_dimz;
  MPI_Status status;


  if(ThisTask == 0)
    {
      printf("Starting periodic PM calculation.\n");
      fflush(stdout);
    }

  force_treefree();


  /* squared smoothing scale, expressed in units of the integer wave number */
  asmth2 = (2 * M_PI) * All.Asmth[0] / All.BoxSize;
  asmth2 *= asmth2;

  fac = All.G / (M_PI * All.BoxSize);	/* to get potential */
  fac *= 1 / (2 * All.BoxSize / PMGRID);	/* for finite differencing */

  /* first, establish the extension of the local patch in the PMGRID  */

  for(j = 0; j < 3; j++)
    {
      meshmin[j] = PMGRID;
      meshmax[j] = 0;
    }

  for(i = 0; i < NumPart; i++)
    {
      for(j = 0; j < 3; j++)
	{
	  slab = to_slab_fac * P[i].Pos[j];
	  if(slab >= PMGRID)
	    slab = PMGRID - 1;

	  if(slab < meshmin[j])
	    meshmin[j] = slab;

	  if(slab > meshmax[j])
	    meshmax[j] = slab;
	}
    }

  /* every task learns the patch extent of every other task */
  MPI_Allgather(meshmin, 3, MPI_INT, meshmin_list, 3, MPI_INT, MPI_COMM_WORLD);
  MPI_Allgather(meshmax, 3, MPI_INT, meshmax_list, 3, MPI_INT, MPI_COMM_WORLD);

  /* +2: the CIC assignment below also writes to the cell at index + 1 */
  dimx = meshmax[0] - meshmin[0] + 2;
  dimy = meshmax[1] - meshmin[1] + 2;
  dimz = meshmax[2] - meshmin[2] + 2;

  pm_init_periodic_allocate((dimx + 4) * (dimy + 4) * (dimz + 4));

#ifdef FFTW3
  /* The plans are created on the first call only and re-used afterwards. */
  if(!fftw_plan_exists)
    {
      /* Create plan for in-place r2c DFT */
      fft_forward_plan = fftw_mpi_plan_dft_r2c_3d(PMGRID, PMGRID, PMGRID, rhogrid, fft_of_rhogrid,
						  MPI_COMM_WORLD, FFTW_PATIENT | FFTW_MPI_TRANSPOSED_OUT);
      fft_inverse_plan = fftw_mpi_plan_dft_c2r_3d(PMGRID, PMGRID, PMGRID, fft_of_rhogrid, rhogrid,
						  MPI_COMM_WORLD, FFTW_PATIENT | FFTW_MPI_TRANSPOSED_IN);
      fftw_plan_exists = true;	// use C99 bool type

      if(ThisTask == 0)
	printf("Created new FFTW3 plan.\n");
    }
#endif

  /* For FFTW3, there is a different convention for fftsize for real-to-complex transforms, i.e.
     fftsize is the size of the complex data (number of complex values), NOT the size of the real data!
     We attempt to take care of this by defining fftsize to be fftsize_real when using FFTW3.  */

  for(i = 0; i < dimx * dimy * dimz; i++)
    workspace[i] = 0;

  /* CIC mass assignment onto the local patch */
  for(i = 0; i < NumPart; i++)
    {
      slab_x = to_slab_fac * P[i].Pos[0];
      if(slab_x >= PMGRID)
	slab_x = PMGRID - 1;
      dx = to_slab_fac * P[i].Pos[0] - slab_x;
      slab_x -= meshmin[0];
      slab_xx = slab_x + 1;

      slab_y = to_slab_fac * P[i].Pos[1];
      if(slab_y >= PMGRID)
	slab_y = PMGRID - 1;
      dy = to_slab_fac * P[i].Pos[1] - slab_y;
      slab_y -= meshmin[1];
      slab_yy = slab_y + 1;

      slab_z = to_slab_fac * P[i].Pos[2];
      if(slab_z >= PMGRID)
	slab_z = PMGRID - 1;
      dz = to_slab_fac * P[i].Pos[2] - slab_z;
      slab_z -= meshmin[2];
      slab_zz = slab_z + 1;

      workspace[(slab_x * dimy + slab_y) * dimz + slab_z] += P[i].Mass * (1.0 - dx) * (1.0 - dy) * (1.0 - dz);
      workspace[(slab_x * dimy + slab_yy) * dimz + slab_z] += P[i].Mass * (1.0 - dx) * dy * (1.0 - dz);
      workspace[(slab_x * dimy + slab_y) * dimz + slab_zz] += P[i].Mass * (1.0 - dx) * (1.0 - dy) * dz;
      workspace[(slab_x * dimy + slab_yy) * dimz + slab_zz] += P[i].Mass * (1.0 - dx) * dy * dz;

      workspace[(slab_xx * dimy + slab_y) * dimz + slab_z] += P[i].Mass * (dx) * (1.0 - dy) * (1.0 - dz);
      workspace[(slab_xx * dimy + slab_yy) * dimz + slab_z] += P[i].Mass * (dx) * dy * (1.0 - dz);
      workspace[(slab_xx * dimy + slab_y) * dimz + slab_zz] += P[i].Mass * (dx) * (1.0 - dy) * dz;
      workspace[(slab_xx * dimy + slab_yy) * dimz + slab_zz] += P[i].Mass * (dx) * dy * dz;
    }


  for(i = 0; i < fftsize; i++)	/* clear local density field */
    rhogrid[i] = 0;

  /* pairwise exchange: in round "level" this task talks to ThisTask ^ level */
  for(level = 0; level < (1 << PTask); level++)	/* note: for level=0, target is the same task */
    {
      sendTask = ThisTask;
      recvTask = ThisTask ^ level;
      if(recvTask < NTask)
	{
	  /* check how much we have to send */
	  sendmin = 2 * PMGRID;
	  sendmax = -1;
	  for(slab_x = meshmin[0]; slab_x < meshmax[0] + 2; slab_x++)
	    if(slab_to_task[slab_x % PMGRID] == recvTask)
	      {
		if(slab_x < sendmin)
		  sendmin = slab_x;
		if(slab_x > sendmax)
		  sendmax = slab_x;
	      }
	  if(sendmax == -1)
	    sendmin = 0;

	  /* check how much we have to receive */
	  recvmin = 2 * PMGRID;
	  recvmax = -1;
	  for(slab_x = meshmin_list[3 * recvTask]; slab_x < meshmax_list[3 * recvTask] + 2; slab_x++)
	    if(slab_to_task[slab_x % PMGRID] == sendTask)
	      {
		if(slab_x < recvmin)
		  recvmin = slab_x;
		if(slab_x > recvmax)
		  recvmax = slab_x;
	      }
	  if(recvmax == -1)
	    recvmin = 0;


	  if((recvmax - recvmin) >= 0 || (sendmax - sendmin) >= 0)	/* ok, we have a contribution to the slab */
	    {
	      recv_dimx = meshmax_list[3 * recvTask + 0] - meshmin_list[3 * recvTask + 0] + 2;
	      recv_dimy = meshmax_list[3 * recvTask + 1] - meshmin_list[3 * recvTask + 1] + 2;
	      recv_dimz = meshmax_list[3 * recvTask + 2] - meshmin_list[3 * recvTask + 2] + 2;

	      if(level > 0)
		{
		  MPI_Sendrecv(workspace + (sendmin - meshmin[0]) * dimy * dimz,
			       (sendmax - sendmin + 1) * dimy * dimz * sizeof(fftw_real), MPI_BYTE, recvTask,
			       TAG_PERIODIC_A, forcegrid,
			       (recvmax - recvmin + 1) * recv_dimy * recv_dimz * sizeof(fftw_real), MPI_BYTE,
			       recvTask, TAG_PERIODIC_A, MPI_COMM_WORLD, &status);
		}
	      else
		{
		  memcpy(forcegrid, workspace + (sendmin - meshmin[0]) * dimy * dimz,
			 (sendmax - sendmin + 1) * dimy * dimz * sizeof(fftw_real));
		}

	      /* accumulate the received patch into the local slabs, wrapping periodically */
	      for(slab_x = recvmin; slab_x <= recvmax; slab_x++)
		{
		  slab_xx = (slab_x % PMGRID) - first_slab_of_task[ThisTask];

		  if(slab_xx >= 0 && slab_xx < slabs_per_task[ThisTask])
		    {
		      for(slab_y = meshmin_list[3 * recvTask + 1];
			  slab_y <= meshmax_list[3 * recvTask + 1] + 1; slab_y++)
			{
			  slab_yy = slab_y;
			  if(slab_yy >= PMGRID)
			    slab_yy -= PMGRID;

			  for(slab_z = meshmin_list[3 * recvTask + 2];
			      slab_z <= meshmax_list[3 * recvTask + 2] + 1; slab_z++)
			    {
			      slab_zz = slab_z;
			      if(slab_zz >= PMGRID)
				slab_zz -= PMGRID;

			      rhogrid[PMGRID * PMGRID2 * slab_xx + PMGRID2 * slab_yy + slab_zz] +=
				forcegrid[((slab_x - recvmin) * recv_dimy +
					   (slab_y - meshmin_list[3 * recvTask + 1])) * recv_dimz +
					  (slab_z - meshmin_list[3 * recvTask + 2])];
			    }
			}
		    }
		}
	    }
	}
    }

#ifdef DEBUG_FFT
  /* Debug aid: sum of squares over this task's part of the density grid
   * (no reduction across tasks is done); only task 0 prints its value. */
  double norm_density = 0.;
  for(i = 0; i < fftsize; i++)
    {
      norm_density += rhogrid[i] * rhogrid[i];
    }

  if(ThisTask == 0)
    {
      printf("L2-norm of density: %f\n", norm_density);
      printf("First five values of density: %f, %f, %f, %f, %f\n", rhogrid[0],rhogrid[1],rhogrid[2],rhogrid[3], rhogrid[4]);
    }
#endif

  /* Do the FFT of the density field */
#ifdef FFTW3
  /* The plan is created once but the grids are passed explicitly on every
   * call, i.e. this is a "new-array" execution.  Plans made by
   * fftw_mpi_plan_* must not be run through the serial fftw_execute_dft_r2c();
   * the FFTW manual requires the fftw_mpi_execute_* variants instead. */
  fftw_mpi_execute_dft_r2c(fft_forward_plan, rhogrid, fft_of_rhogrid);
#else
  rfftwnd_mpi(fft_forward_plan, 1, rhogrid, workspace, FFTW_TRANSPOSED_ORDER);
#endif

#ifdef DEBUG_FFT
  /* Debug aid: same local sum of squares, now over the transformed data as
   * seen through the real-valued rhogrid pointer. */
  double norm_complex = 0.;
  for(i = 0; i < fftsize; i++)
    {
      norm_complex += rhogrid[i] * rhogrid[i];
    }

  if(ThisTask == 0)
    {
      printf("L2-norm of complex rhogrid: %f\n", norm_complex);
      printf("First two values of complex fft: %f + i*%f, %f + i*%f\n", rhogrid[0],rhogrid[1],rhogrid[2],rhogrid[3]);
    }
#endif

  /* multiply with Green's function for the potential */

  for(y = slabstart_y; y < slabstart_y + nslab_y; y++)
    for(x = 0; x < PMGRID; x++)
      for(z = 0; z < PMGRID / 2 + 1; z++)
	{
	  /* map grid indices to signed integer wave numbers */
	  if(x > PMGRID / 2)
	    kx = x - PMGRID;
	  else
	    kx = x;
	  if(y > PMGRID / 2)
	    ky = y - PMGRID;
	  else
	    ky = y;
	  if(z > PMGRID / 2)
	    kz = z - PMGRID;
	  else
	    kz = z;

	  k2 = kx * kx + ky * ky + kz * kz;

	  if(k2 > 0)
	    {
	      /* "fac" is applied later, in the finite-difference step */
	      smth = -exp(-k2 * asmth2) / k2;

	      /* do deconvolution */

	      fx = fy = fz = 1;
	      if(kx != 0)
		{
		  fx = (M_PI * kx) / PMGRID;
		  fx = sin(fx) / fx;
		}
	      if(ky != 0)
		{
		  fy = (M_PI * ky) / PMGRID;
		  fy = sin(fy) / fy;
		}
	      if(kz != 0)
		{
		  fz = (M_PI * kz) / PMGRID;
		  fz = sin(fz) / fz;
		}
	      ff = 1 / (fx * fy * fz);
	      smth *= ff * ff * ff * ff;

	      /* end deconvolution */

	      /* transposed layout: y is the slowest-varying index here */
	      ip = PMGRID * (PMGRID / 2 + 1) * (y - slabstart_y) + (PMGRID / 2 + 1) * x + z;
	      c_re(fft_of_rhogrid[ip]) *= smth;
	      c_im(fft_of_rhogrid[ip]) *= smth;
	    }
	}

  /* remove the k = 0 mode on the task that holds it */
  if(slabstart_y == 0)
    c_re(fft_of_rhogrid[0]) = c_im(fft_of_rhogrid[0]) = 0.0;

  /* Do the FFT to get the potential */
#ifdef FFTW3
  /* MPI new-array execute, see the forward transform above.  The result is
   * deliberately not divided by PMGRID^3, matching the FFTW2 branch. */
  fftw_mpi_execute_dft_c2r(fft_inverse_plan, fft_of_rhogrid, rhogrid);
#else
  rfftwnd_mpi(fft_inverse_plan, 1, rhogrid, workspace, FFTW_TRANSPOSED_ORDER);
#endif

#ifdef DEBUG_FFT
  /* Debug aid: local sum of squares of the back-transformed grid. */
  double norm = 0.;
  for(i = 0; i < fftsize; i++)
    {
      norm += rhogrid[i] * rhogrid[i];
    }

  if(ThisTask == 0)
    {
      printf("L2-norm of rhogrid: %f\n", norm);
      printf("First five values of fft: %f, %f, %f, %f, %f\n\n", rhogrid[0],rhogrid[1],rhogrid[2],rhogrid[3],rhogrid[4]);
    }
#endif

  /* Now rhogrid holds the potential */
  /* construct the potential for the local patch */


  /* patch dimensions including a margin of 2 cells on both sides */
  dimx = meshmax[0] - meshmin[0] + 6;
  dimy = meshmax[1] - meshmin[1] + 6;
  dimz = meshmax[2] - meshmin[2] + 6;

  for(level = 0; level < (1 << PTask); level++)	/* note: for level=0, target is the same task */
    {
      sendTask = ThisTask;
      recvTask = ThisTask ^ level;

      if(recvTask < NTask)
	{

	  /* check how much we have to send */
	  sendmin = 2 * PMGRID;
	  sendmax = -PMGRID;
	  for(slab_x = meshmin_list[3 * recvTask] - 2; slab_x < meshmax_list[3 * recvTask] + 4; slab_x++)
	    if(slab_to_task[(slab_x + PMGRID) % PMGRID] == sendTask)
	      {
		if(slab_x < sendmin)
		  sendmin = slab_x;
		if(slab_x > sendmax)
		  sendmax = slab_x;
	      }
	  if(sendmax == -PMGRID)
	    sendmin = sendmax + 1;


	  /* check how much we have to receive */
	  recvmin = 2 * PMGRID;
	  recvmax = -PMGRID;
	  for(slab_x = meshmin[0] - 2; slab_x < meshmax[0] + 4; slab_x++)
	    if(slab_to_task[(slab_x + PMGRID) % PMGRID] == recvTask)
	      {
		if(slab_x < recvmin)
		  recvmin = slab_x;
		if(slab_x > recvmax)
		  recvmax = slab_x;
	      }
	  if(recvmax == -PMGRID)
	    recvmin = recvmax + 1;

	  if((recvmax - recvmin) >= 0 || (sendmax - sendmin) >= 0)	/* ok, we have a contribution to the slab */
	    {
	      recv_dimx = meshmax_list[3 * recvTask + 0] - meshmin_list[3 * recvTask + 0] + 6;
	      recv_dimy = meshmax_list[3 * recvTask + 1] - meshmin_list[3 * recvTask + 1] + 6;
	      recv_dimz = meshmax_list[3 * recvTask + 2] - meshmin_list[3 * recvTask + 2] + 6;

	      /* because of the periodic wrap the slab range may consist of two
	         separate pieces; second piece starts out empty */
	      ncont = 1;
	      cont_sendmin[0] = sendmin;
	      cont_sendmax[0] = sendmax;
	      cont_sendmin[1] = sendmax + 1;
	      cont_sendmax[1] = sendmax;

	      cont_recvmin[0] = recvmin;
	      cont_recvmax[0] = recvmax;
	      cont_recvmin[1] = recvmax + 1;
	      cont_recvmax[1] = recvmax;

	      for(slab_x = sendmin; slab_x <= sendmax; slab_x++)
		{
		  if(slab_to_task[(slab_x + PMGRID) % PMGRID] != ThisTask)
		    {
		      /* non-contiguous */
		      cont_sendmax[0] = slab_x - 1;
		      while(slab_to_task[(slab_x + PMGRID) % PMGRID] != ThisTask)
			slab_x++;
		      cont_sendmin[1] = slab_x;
		      ncont++;
		    }
		}

	      for(slab_x = recvmin; slab_x <= recvmax; slab_x++)
		{
		  if(slab_to_task[(slab_x + PMGRID) % PMGRID] != recvTask)
		    {
		      /* non-contiguous */
		      cont_recvmax[0] = slab_x - 1;
		      while(slab_to_task[(slab_x + PMGRID) % PMGRID] != recvTask)
			slab_x++;
		      cont_recvmin[1] = slab_x;
		      if(ncont == 1)
			ncont++;
		    }
		}


	      for(rep = 0; rep < ncont; rep++)
		{
		  sendmin = cont_sendmin[rep];
		  sendmax = cont_sendmax[rep];
		  recvmin = cont_recvmin[rep];
		  recvmax = cont_recvmax[rep];

		  /* prepare what we want to send */
		  if(sendmax - sendmin >= 0)
		    {
		      for(slab_x = sendmin; slab_x <= sendmax; slab_x++)
			{
			  slab_xx = ((slab_x + PMGRID) % PMGRID) - first_slab_of_task[ThisTask];

			  for(slab_y = meshmin_list[3 * recvTask + 1] - 2;
			      slab_y < meshmax_list[3 * recvTask + 1] + 4; slab_y++)
			    {
			      slab_yy = (slab_y + PMGRID) % PMGRID;

			      for(slab_z = meshmin_list[3 * recvTask + 2] - 2;
				  slab_z < meshmax_list[3 * recvTask + 2] + 4; slab_z++)
				{
				  slab_zz = (slab_z + PMGRID) % PMGRID;

				  forcegrid[((slab_x - sendmin) * recv_dimy +
					     (slab_y - (meshmin_list[3 * recvTask + 1] - 2))) * recv_dimz +
					    slab_z - (meshmin_list[3 * recvTask + 2] - 2)] =
				    rhogrid[PMGRID * PMGRID2 * slab_xx + PMGRID2 * slab_yy + slab_zz];
				}
			    }
			}
		    }

		  if(level > 0)
		    {
		      MPI_Sendrecv(forcegrid,
				   (sendmax - sendmin + 1) * recv_dimy * recv_dimz * sizeof(fftw_real),
				   MPI_BYTE, recvTask, TAG_PERIODIC_B,
				   workspace + (recvmin - (meshmin[0] - 2)) * dimy * dimz,
				   (recvmax - recvmin + 1) * dimy * dimz * sizeof(fftw_real), MPI_BYTE,
				   recvTask, TAG_PERIODIC_B, MPI_COMM_WORLD, &status);
		    }
		  else
		    {
		      memcpy(workspace + (recvmin - (meshmin[0] - 2)) * dimy * dimz,
			     forcegrid, (recvmax - recvmin + 1) * dimy * dimz * sizeof(fftw_real));
		    }
		}
	    }
	}
    }


  /* dim*: patch without margin (layout of forcegrid);
     recv_dim*: patch with margin (layout of workspace) */
  dimx = meshmax[0] - meshmin[0] + 2;
  dimy = meshmax[1] - meshmin[1] + 2;
  dimz = meshmax[2] - meshmin[2] + 2;

  recv_dimx = meshmax[0] - meshmin[0] + 6;
  recv_dimy = meshmax[1] - meshmin[1] + 6;
  recv_dimz = meshmax[2] - meshmin[2] + 6;


  for(dim = 0; dim < 3; dim++)	/* Calculate each component of the force. */
    {
      /* get the force component by finite differencing the potential */
      /* note: "workspace" now contains the potential for the local patch, plus a sufficiently large buffer region */

      for(x = 0; x < meshmax[0] - meshmin[0] + 2; x++)
	for(y = 0; y < meshmax[1] - meshmin[1] + 2; y++)
	  for(z = 0; z < meshmax[2] - meshmin[2] + 2; z++)
	    {
	      /* neighbours at distance 1 (l/r) and 2 (ll/rr) along "dim" only */
	      xrr = xll = xr = xl = x;
	      yrr = yll = yr = yl = y;
	      zrr = zll = zr = zl = z;

	      switch (dim)
		{
		case 0:
		  xr = x + 1;
		  xrr = x + 2;
		  xl = x - 1;
		  xll = x - 2;
		  break;
		case 1:
		  yr = y + 1;
		  yl = y - 1;
		  yrr = y + 2;
		  yll = y - 2;
		  break;
		case 2:
		  zr = z + 1;
		  zl = z - 1;
		  zrr = z + 2;
		  zll = z - 2;
		  break;
		}

	      /* the +2 offsets skip the margin of the workspace patch */
	      forcegrid[(x * dimy + y) * dimz + z]
		=
		fac * ((4.0 / 3) *
		       (workspace[((xl + 2) * recv_dimy + (yl + 2)) * recv_dimz + (zl + 2)]
			- workspace[((xr + 2) * recv_dimy + (yr + 2)) * recv_dimz + (zr + 2)]) -
		       (1.0 / 6) *
		       (workspace[((xll + 2) * recv_dimy + (yll + 2)) * recv_dimz + (zll + 2)] -
			workspace[((xrr + 2) * recv_dimy + (yrr + 2)) * recv_dimz + (zrr + 2)]));
	    }

      /* read out the forces */

      for(i = 0; i < NumPart; i++)
	{
	  slab_x = to_slab_fac * P[i].Pos[0];
	  if(slab_x >= PMGRID)
	    slab_x = PMGRID - 1;
	  dx = to_slab_fac * P[i].Pos[0] - slab_x;
	  slab_x -= meshmin[0];
	  slab_xx = slab_x + 1;

	  slab_y = to_slab_fac * P[i].Pos[1];
	  if(slab_y >= PMGRID)
	    slab_y = PMGRID - 1;
	  dy = to_slab_fac * P[i].Pos[1] - slab_y;
	  slab_y -= meshmin[1];
	  slab_yy = slab_y + 1;

	  slab_z = to_slab_fac * P[i].Pos[2];
	  if(slab_z >= PMGRID)
	    slab_z = PMGRID - 1;
	  dz = to_slab_fac * P[i].Pos[2] - slab_z;
	  slab_z -= meshmin[2];
	  slab_zz = slab_z + 1;

	  /* tri-linear interpolation with the same weights as the assignment */
	  acc_dim =
	    forcegrid[(slab_x * dimy + slab_y) * dimz + slab_z] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz);
	  acc_dim += forcegrid[(slab_x * dimy + slab_yy) * dimz + slab_z] * (1.0 - dx) * dy * (1.0 - dz);
	  acc_dim += forcegrid[(slab_x * dimy + slab_y) * dimz + slab_zz] * (1.0 - dx) * (1.0 - dy) * dz;
	  acc_dim += forcegrid[(slab_x * dimy + slab_yy) * dimz + slab_zz] * (1.0 - dx) * dy * dz;

	  acc_dim += forcegrid[(slab_xx * dimy + slab_y) * dimz + slab_z] * (dx) * (1.0 - dy) * (1.0 - dz);
	  acc_dim += forcegrid[(slab_xx * dimy + slab_yy) * dimz + slab_z] * (dx) * dy * (1.0 - dz);
	  acc_dim += forcegrid[(slab_xx * dimy + slab_y) * dimz + slab_zz] * (dx) * (1.0 - dy) * dz;
	  acc_dim += forcegrid[(slab_xx * dimy + slab_yy) * dimz + slab_zz] * (dx) * dy * dz;

	  P[i].GravPM[dim] = acc_dim;
	}
    }

  pm_init_periodic_free();
  force_treeallocate(All.TreeAllocFactor * All.MaxPart, All.MaxPart);

  All.NumForcesSinceLastDomainDecomp = 1 + All.TotNumPart * All.TreeDomainUpdateFrequency;

  if(ThisTask == 0)
    {
      printf("done PM.\n");
      fflush(stdout);
    }
}