/* MPI build: create distributed FFTW-2 real<->complex plans on 'comm' and
 * query this rank's slab decomposition.
 * NOTE(review): this excerpt begins inside an #if branch whose opening
 * directive (presumably #ifdef HAVE_MPI, matching the #endif comment below)
 * lies above the visible region. */
void Grid::initialize(MPI_Comm comm, int nx_, int ny_, int nz_)
{
  nx = nx_;
  ny = ny_;
  nz = nz_;
  /* Forward (real->complex) and inverse (complex->real) MPI plans. */
  fft_plan = rfftw3d_mpi_create_plan(comm, nx, ny, nz, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE);
  ifft_plan = rfftw3d_mpi_create_plan(comm, nx, ny, nz, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE);
  /* Slab layout on this rank: nxloc x-slabs starting at ixmin, the
   * transposed y-layout (nyloc_t slabs from iymin_t), and the required
   * per-rank buffer length in local_size. */
  rfftwnd_mpi_local_sizes(fft_plan, &nxloc, &ixmin, &nyloc_t, &iymin_t, &local_size);
#else
/* Serial build: the whole grid lives on this process. */
void Grid::initialize(int nx_, int ny_, int nz_)
{
  nx = nx_;
  ny = ny_;
  nz = nz_;
  /* NOTE(review): these two plans are stored only in locals that are never
   * assigned to fft_plan/ifft_plan and are leaked on return -- the serial
   * FFTs presumably cannot see them; verify against the rest of the class
   * (likely intended: fft_plan = ...; ifft_plan = ...;). */
  rfftwnd_plan plan = rfftw3d_create_plan(nx, ny, nz, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE);
  rfftwnd_plan iplan = rfftw3d_create_plan(nx, ny, nz, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
  nxloc = nx;
  nyloc_t = ny;
  ixmin = iymin_t = 0;
  /* 2*(nz/2+1) reals per (x,y) pencil: padding for the in-place r2c layout. */
  local_size = nx * ny * 2*(nz/2+1);
#endif // HAVE_MPI
  /* Allocate extra storage so that each process can hold the boundary
   * layer from the adjacent process */
  assert(local_size == nxloc*ny*2*(nz/2+1));
  local_size = (nxloc+1)*ny*2*(nz/2+1);
  /* NOTE(review): malloc result is not checked before use. */
  data = (fftw_real*) malloc(local_size*sizeof(fftw_real));
}
/*! Initialization of the non-periodic PM routines. The plan-files for FFTW * are created. Finally, the routine to set-up the non-periodic Greens * function is called. */ void pm_init_nonperiodic(void) { int i, slab_to_task_local[GRID]; double bytes_tot = 0; size_t bytes; /* Set up the FFTW plan files. */ fft_forward_plan = rfftw3d_mpi_create_plan(MPI_COMM_WORLD, GRID, GRID, GRID, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE); fft_inverse_plan = rfftw3d_mpi_create_plan(MPI_COMM_WORLD, GRID, GRID, GRID, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE); /* Workspace out the ranges on each processor. */ rfftwnd_mpi_local_sizes(fft_forward_plan, &nslab_x, &slabstart_x, &nslab_y, &slabstart_y, &fftsize); for(i = 0; i < GRID; i++) slab_to_task_local[i] = 0; for(i = 0; i < nslab_x; i++) slab_to_task_local[slabstart_x + i] = ThisTask; MPI_Allreduce(slab_to_task_local, slab_to_task, GRID, MPI_INT, MPI_SUM, MPI_COMM_WORLD); slabs_per_task = malloc(NTask * sizeof(int)); MPI_Allgather(&nslab_x, 1, MPI_INT, slabs_per_task, 1, MPI_INT, MPI_COMM_WORLD); #ifndef PERIODIC if(ThisTask == 0) { for(i = 0; i < NTask; i++) printf("Task=%d FFT-Slabs=%d\n", i, slabs_per_task[i]); } #endif first_slab_of_task = malloc(NTask * sizeof(int)); MPI_Allgather(&slabstart_x, 1, MPI_INT, first_slab_of_task, 1, MPI_INT, MPI_COMM_WORLD); meshmin_list = malloc(3 * NTask * sizeof(int)); meshmax_list = malloc(3 * NTask * sizeof(int)); MPI_Allreduce(&fftsize, &maxfftsize, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); /* now allocate memory to hold the FFT fields */ #if !defined(PERIODIC) if(!(kernel[0] = (fftw_real *) malloc(bytes = fftsize * sizeof(fftw_real)))) { printf("failed to allocate memory for `FFT-kernel[0]' (%g MB).\n", bytes / (1024.0 * 1024.0)); endrun(1); } bytes_tot += bytes; fft_of_kernel[0] = (fftw_complex *) kernel[0]; #endif #if defined(PLACEHIGHRESREGION) if(!(kernel[1] = (fftw_real *) malloc(bytes = fftsize * sizeof(fftw_real)))) { printf("failed to allocate memory for 
`FFT-kernel[1]' (%g MB).\n", bytes / (1024.0 * 1024.0)); endrun(1); } bytes_tot += bytes; fft_of_kernel[1] = (fftw_complex *) kernel[1]; #endif if(ThisTask == 0) printf("\nAllocated %g MByte for FFT kernel(s).\n\n", bytes_tot / (1024.0 * 1024.0)); }
/*! This routines generates the FFTW-plans to carry out the parallel FFTs * later on. Some auxiliary variables are also initialized. */ void pm_init_periodic(void) { int i; int slab_to_task_local[PMGRID]; All.Asmth[0] = ASMTH * All.BoxSize / PMGRID; All.Rcut[0] = RCUT * All.Asmth[0]; /* Set up the FFTW plan files. */ #ifndef NOMPI fft_forward_plan = rfftw3d_mpi_create_plan(MPI_COMM_WORLD, PMGRID, PMGRID, PMGRID, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE); fft_inverse_plan = rfftw3d_mpi_create_plan(MPI_COMM_WORLD, PMGRID, PMGRID, PMGRID, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE); #else fft_forward_plan = rfftw3d_create_plan(PMGRID, PMGRID, PMGRID, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE); fft_inverse_plan = rfftw3d_create_plan(PMGRID, PMGRID, PMGRID, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE); #endif /* Workspace out the ranges on each processor. */ rfftwnd_mpi_local_sizes(fft_forward_plan, &nslab_x, &slabstart_x, &nslab_y, &slabstart_y, &fftsize); for(i = 0; i < PMGRID; i++) slab_to_task_local[i] = 0; for(i = 0; i < nslab_x; i++) slab_to_task_local[slabstart_x + i] = ThisTask; slabs_per_task = malloc(NTask * sizeof(int)); #ifndef NOMPI MPI_Allreduce(slab_to_task_local, slab_to_task, PMGRID, MPI_INT, MPI_SUM, MPI_COMM_WORLD); MPI_Allreduce(&nslab_x, &smallest_slab, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); MPI_Allgather(&nslab_x, 1, MPI_INT, slabs_per_task, 1, MPI_INT, MPI_COMM_WORLD); #else slab_to_task = slab_to_task_local; smallest_slab = nslab_x; slabs_per_task[0] = nslab_x; #endif if(ThisTask == 0) { for(i = 0; i < NTask; i++) printf("Task=%d FFT-Slabs=%d\n", i, slabs_per_task[i]); } first_slab_of_task = malloc(NTask * sizeof(int)); MPI_Allgather(&slabstart_x, 1, MPI_INT, first_slab_of_task, 1, MPI_INT, MPI_COMM_WORLD); meshmin_list = malloc(3 * NTask * sizeof(int)); meshmax_list = malloc(3 * NTask * sizeof(int)); to_slab_fac = PMGRID / All.BoxSize; #ifndef NOMPI MPI_Allreduce(&fftsize, &maxfftsize, 1, MPI_INT, MPI_MAX, 
MPI_COMM_WORLD); #else maxfftsize = fftsize; #endif }
/*! This routines generates the FFTW-plans to carry out the parallel FFTs
 * later on. Some auxiliary variables are also initialized. */
/* NOTE(review): this is a second definition of pm_init_periodic() within this
 * excerpt (an FFTW3-capable variant); presumably the two versions belong to
 * different translation units or build configurations -- verify. */
void pm_init_periodic(void)
{
  int i;
  int slab_to_task_local[PMGRID];

  /* Long-range/short-range force split scale and cutoff in box units. */
  All.Asmth[0] = ASMTH * All.BoxSize / PMGRID;
  All.Rcut[0] = RCUT * All.Asmth[0];

  /* Initialize FFTW MPI */
#ifdef FFTW3
  fftw_mpi_init();
#endif

#ifdef FFTW3
  /* If using FFTW3, don't create plans yet, just figure out the local array sizes */
  /* NOTE(review): 0.5*PMGRID2 is a floating-point expression implicitly
   * converted to ptrdiff_t; this assumes PMGRID2 == 2*(PMGRID/2+1) so the
   * value equals the complex extent PMGRID/2+1 -- confirm against the
   * PMGRID2 definition (not visible here). */
  fftsize_complex = fftw_mpi_local_size_3d_transposed(PMGRID, PMGRID, 0.5*PMGRID2, MPI_COMM_WORLD,
						      &nslab_x, &slabstart_x, &nslab_y, &slabstart_y);
  /* Each complex element occupies two reals in the in-place array.
   * NOTE(review): 2.*fftsize_complex is double arithmetic assigned to what is
   * presumably an integer -- confirm fftsize_real's declared type. */
  fftsize_real = 2.*fftsize_complex;
  fftw_plan_exists = false;
#else
  /* Set up the FFTW plan files. */
  fft_forward_plan = rfftw3d_mpi_create_plan(MPI_COMM_WORLD, PMGRID, PMGRID, PMGRID,
					     FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE);
  fft_inverse_plan = rfftw3d_mpi_create_plan(MPI_COMM_WORLD, PMGRID, PMGRID, PMGRID,
					     FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);

  /* Workspace out the ranges on each processor. */
  rfftwnd_mpi_local_sizes(fft_forward_plan, &nslab_x, &slabstart_x, &nslab_y, &slabstart_y, &fftsize);
#endif

  /* Each task marks the x-slabs it owns; the sum-reduction below merges the
   * per-task maps into the global slab->task table. */
  for(i = 0; i < PMGRID; i++)
    slab_to_task_local[i] = 0;

  for(i = 0; i < nslab_x; i++)
    slab_to_task_local[slabstart_x + i] = ThisTask;

  MPI_Allreduce(slab_to_task_local, slab_to_task, PMGRID, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

  MPI_Allreduce(&nslab_x, &smallest_slab, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);

  /* NOTE(review): the malloc results below are not checked before use. */
  slabs_per_task = malloc(NTask * sizeof(int));
  MPI_Allgather(&nslab_x, 1, MPI_INT, slabs_per_task, 1, MPI_INT, MPI_COMM_WORLD);

  if(ThisTask == 0)
    {
      for(i = 0; i < NTask; i++)
	printf("Task=%d FFT-Slabs=%d\n", i, slabs_per_task[i]);
    }

  first_slab_of_task = malloc(NTask * sizeof(int));
  MPI_Allgather(&slabstart_x, 1, MPI_INT, first_slab_of_task, 1, MPI_INT, MPI_COMM_WORLD);

  meshmin_list = malloc(3 * NTask * sizeof(int));
  meshmax_list = malloc(3 * NTask * sizeof(int));

  /* Conversion factor from position to slab index. */
  to_slab_fac = PMGRID / All.BoxSize;

  MPI_Allreduce(&fftsize, &maxfftsize, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
}