void Grid::initialize(MPI_Comm comm, int nx_, int ny_, int nz_) { nx = nx_; ny = ny_; nz = nz_; fft_plan = rfftw3d_mpi_create_plan(comm, nx, ny, nz, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE); ifft_plan = rfftw3d_mpi_create_plan(comm, nx, ny, nz, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE); rfftwnd_mpi_local_sizes(fft_plan, &nxloc, &ixmin, &nyloc_t, &iymin_t, &local_size); #else void Grid::initialize(int nx_, int ny_, int nz_) { nx = nx_; ny = ny_; nz = nz_; rfftwnd_plan plan = rfftw3d_create_plan(nx, ny, nz, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE); rfftwnd_plan iplan = rfftw3d_create_plan(nx, ny, nz, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE); nxloc = nx; nyloc_t = ny; ixmin = iymin_t = 0; local_size = nx * ny * 2*(nz/2+1); #endif // HAVE_MPI /* Allocate extra storage so that each process can hold the boundary * layer from the adjacent process */ assert(local_size == nxloc*ny*2*(nz/2+1)); local_size = (nxloc+1)*ny*2*(nz/2+1); data = (fftw_real*) malloc(local_size*sizeof(fftw_real)); }
// Allocate the rank-local field buffer (FFT->total_local_size real elements)
// and alias it as the complex view used for in-place transforms.
//
// The FFTW3 path sizes the request with sizeof(gbpFFT_real) instead of using
// fftwf_alloc_real(), which always allocates float-sized elements and is
// therefore too small when the USE_DOUBLE plans treat the buffer as double.
static void init_field_alloc_local(field_info *FFT) {
#if USE_FFTW3
    FFT->field_local = (gbpFFT_real *)fftwf_malloc(sizeof(gbpFFT_real) * FFT->total_local_size);
#else
    FFT->field_local = (gbpFFT_real *)SID_malloc(sizeof(gbpFFT_real) * FFT->total_local_size);
#endif
    FFT->cfield_local = (gbpFFT_complex *)FFT->field_local;
}

// Initialise an n_d-dimensional real-to-complex FFT descriptor.
//
//   n_d : number of dimensions
//   n   : grid size along each dimension (n_d entries)
//   L   : box length along each dimension (n_d entries)
//   FFT : descriptor to fill in; every array member is allocated here
//
// The routine copies the global sizes, creates the forward and inverse FFTW
// plans (FFTW 2 or FFTW 3, serial or MPI, chosen by FFTW_V2 / USE_MPI /
// USE_DOUBLE), allocates and clears the local field, builds the real-space
// and k-space coordinate arrays and records the slab-decomposition
// neighbours of this rank.
void init_field(int n_d, int *n, double *L, field_info *FFT) {
    ptrdiff_t n_x_local;
    ptrdiff_t i_x_start_local;
    ptrdiff_t n_y_transpose_local;
    ptrdiff_t i_y_start_transpose_local;

    SID_log("Initializing ", SID_LOG_OPEN);
    for(ptrdiff_t i_d = 0; i_d < n_d; i_d++) {
        if(i_d < (n_d - 1))
            SID_log("%dx", SID_LOG_CONTINUE, n[i_d]);
        else
            SID_log("%d element %d-d FFT ", SID_LOG_CONTINUE, n[i_d], n_d);
    }
    SID_log("(%d byte precision)...", SID_LOG_CONTINUE, (int)sizeof(GBPREAL));

    // Initialize FFT sizes
    FFT->n_d             = n_d;
    FFT->n               = (ptrdiff_t *)SID_calloc(sizeof(ptrdiff_t) * FFT->n_d);
    FFT->L               = (double *)SID_calloc(sizeof(double) * FFT->n_d);
    FFT->n_k_local       = (ptrdiff_t *)SID_calloc(sizeof(ptrdiff_t) * FFT->n_d);
    FFT->n_R_local       = (ptrdiff_t *)SID_calloc(sizeof(ptrdiff_t) * FFT->n_d);
    FFT->i_R_start_local = (ptrdiff_t *)SID_calloc(sizeof(ptrdiff_t) * FFT->n_d);
    FFT->i_k_start_local = (ptrdiff_t *)SID_calloc(sizeof(ptrdiff_t) * FFT->n_d);
    FFT->i_R_stop_local  = (ptrdiff_t *)SID_calloc(sizeof(ptrdiff_t) * FFT->n_d);
    FFT->i_k_stop_local  = (ptrdiff_t *)SID_calloc(sizeof(ptrdiff_t) * FFT->n_d);
    for(ptrdiff_t i_d = 0; i_d < FFT->n_d; i_d++) {
        FFT->n[i_d]               = n[i_d];
        FFT->L[i_d]               = L[i_d];
        FFT->i_R_start_local[i_d] = 0;
        FFT->i_k_start_local[i_d] = 0;
        FFT->n_R_local[i_d]       = FFT->n[i_d];
        FFT->n_k_local[i_d]       = FFT->n[i_d];
    }
    // The last dimension of the complex output holds n/2+1 elements
    FFT->n_k_local[FFT->n_d - 1] = FFT->n[FFT->n_d - 1] / 2 + 1;

    // Default the slab extents to the undecomposed values set above.  Only
    // the MPI branches below overwrite them; without these defaults a serial
    // build would read them uninitialised further down.
    n_x_local       = FFT->n_R_local[0];
    i_x_start_local = FFT->i_R_start_local[0];
    if(FFT->n_d > 1) {
        n_y_transpose_local       = FFT->n_k_local[1];
        i_y_start_transpose_local = FFT->i_k_start_local[1];
    } else {
        n_y_transpose_local       = 0;
        i_y_start_transpose_local = 0;
    }

    // Initialize FFTW

    // Create an integer version of FFT->n[] to pass to ..._create_plan
    int *n_int = (int *)SID_malloc(sizeof(int) * FFT->n_d);
    for(int i_d = 0; i_d < FFT->n_d; i_d++)
        n_int[i_d] = (int)FFT->n[i_d];

#if FFTW_V2
#if USE_MPI
    int total_local_size_int;
    int n_x_local_int;
    int i_x_start_local_int;
    int n_y_transpose_local_int;
    int i_y_start_transpose_local_int;
    FFT->plan  = rfftwnd_mpi_create_plan(SID.COMM_WORLD->comm, FFT->n_d, n_int, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE);
    FFT->iplan = rfftwnd_mpi_create_plan(SID.COMM_WORLD->comm, FFT->n_d, n_int, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE);
    rfftwnd_mpi_local_sizes(FFT->plan,
                            &(n_x_local_int),
                            &(i_x_start_local_int),
                            &(n_y_transpose_local_int),
                            &(i_y_start_transpose_local_int),
                            &total_local_size_int);
    n_x_local                 = (ptrdiff_t)n_x_local_int;
    i_x_start_local           = (ptrdiff_t)i_x_start_local_int;
    n_y_transpose_local       = (ptrdiff_t)n_y_transpose_local_int;
    i_y_start_transpose_local = (ptrdiff_t)i_y_start_transpose_local_int;
    FFT->total_local_size     = (size_t)total_local_size_int;
#else
    FFT->total_local_size = 1;
    for(ptrdiff_t i_d = 0; i_d < FFT->n_d; i_d++) {
        if(i_d < FFT->n_d - 1)
            FFT->total_local_size *= FFT->n[i_d];
        else
            FFT->total_local_size *= 2 * (FFT->n[i_d] / 2 + 1);
    }
#if USE_DOUBLE
    FFT->plan  = fftwnd_create_plan(FFT->n_d, n_int, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE);
    FFT->iplan = fftwnd_create_plan(FFT->n_d, n_int, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
#else
    FFT->plan  = rfftwnd_create_plan(FFT->n_d, n_int, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE);
    FFT->iplan = rfftwnd_create_plan(FFT->n_d, n_int, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
#endif
#endif
    // FFTW 2 plans are not bound to an array, so the field can be allocated
    // once the local size is known.
    init_field_alloc_local(FFT);
#else
    // FFTW 3 planners are handed the arrays the plan will operate on, so in
    // every branch below the field is allocated *before* the plans are made.
#if USE_MPI
    // NOTE(review): the local-size query is given the real-space dimensions
    // and its result is used as a count of real elements; the FFTW MPI manual
    // describes querying r2c transforms with the complex dimensions and
    // doubling the result. Confirm the buffer is large enough.
#if USE_DOUBLE
    fftw_mpi_init();
    FFT->total_local_size = fftw_mpi_local_size_many_transposed(FFT->n_d,
                                                                FFT->n,
                                                                1,
                                                                FFTW_MPI_DEFAULT_BLOCK,
                                                                FFTW_MPI_DEFAULT_BLOCK,
                                                                SID_COMM_WORLD->comm,
                                                                &(n_x_local),
                                                                &(i_x_start_local),
                                                                &(n_y_transpose_local),
                                                                &(i_y_start_transpose_local));
    init_field_alloc_local(FFT);
    FFT->plan  = fftw_mpi_plan_dft_r2c(FFT->n_d, FFT->n, FFT->field_local, FFT->cfield_local, SID_COMM_WORLD->comm, FFTW_ESTIMATE);
    FFT->iplan = fftw_mpi_plan_dft_c2r(FFT->n_d, FFT->n, FFT->cfield_local, FFT->field_local, SID_COMM_WORLD->comm, FFTW_ESTIMATE);
#else
    fftwf_mpi_init();
    FFT->total_local_size = fftwf_mpi_local_size_many_transposed(FFT->n_d,
                                                                 FFT->n,
                                                                 1,
                                                                 FFTW_MPI_DEFAULT_BLOCK,
                                                                 FFTW_MPI_DEFAULT_BLOCK,
                                                                 SID_COMM_WORLD->comm,
                                                                 &(n_x_local),
                                                                 &(i_x_start_local),
                                                                 &(n_y_transpose_local),
                                                                 &(i_y_start_transpose_local));
    init_field_alloc_local(FFT);
    FFT->plan  = fftwf_mpi_plan_dft_r2c(FFT->n_d, FFT->n, FFT->field_local, FFT->cfield_local, SID_COMM_WORLD->comm, FFTW_ESTIMATE);
    FFT->iplan = fftwf_mpi_plan_dft_c2r(FFT->n_d, FFT->n, FFT->cfield_local, FFT->field_local, SID_COMM_WORLD->comm, FFTW_ESTIMATE);
#endif
#else
    FFT->total_local_size = 1;
    for(ptrdiff_t i_d = 0; i_d < FFT->n_d; i_d++) {
        if(i_d < FFT->n_d - 1)
            FFT->total_local_size *= FFT->n[i_d];
        else
            FFT->total_local_size *= 2 * (FFT->n[i_d] / 2 + 1);
    }
    init_field_alloc_local(FFT);
    // The serial planners take the dimensions as 'const int *', hence n_int
    // rather than the ptrdiff_t array FFT->n.
#if USE_DOUBLE
    FFT->plan  = fftw_plan_dft_r2c(FFT->n_d, n_int, FFT->field_local, FFT->cfield_local, FFTW_ESTIMATE);
    FFT->iplan = fftw_plan_dft_c2r(FFT->n_d, n_int, FFT->cfield_local, FFT->field_local, FFTW_ESTIMATE);
#else
    FFT->plan  = fftwf_plan_dft_r2c(FFT->n_d, n_int, FFT->field_local, FFT->cfield_local, FFTW_ESTIMATE);
    FFT->iplan = fftwf_plan_dft_c2r(FFT->n_d, n_int, FFT->cfield_local, FFT->field_local, FFTW_ESTIMATE);
#endif
#endif
#endif
    SID_free(SID_FARG n_int);

    // Set empty slabs to start at 0 to make ignoring them simple.
    if(n_x_local == 0)
        i_x_start_local = 0;
    if(n_y_transpose_local == 0)
        i_y_start_transpose_local = 0;

    // Modify the local slab dimensions according to what FFTW chose.
    FFT->i_R_start_local[0] = i_x_start_local;
    FFT->n_R_local[0]       = n_x_local;
    if(FFT->n_d > 1) {
        FFT->i_k_start_local[1] = i_y_start_transpose_local;
        FFT->n_k_local[1]       = n_y_transpose_local;
    }

    // Upper limits of slab decomposition
    for(ptrdiff_t i_d = 0; i_d < FFT->n_d; i_d++) {
        FFT->i_R_stop_local[i_d] = FFT->i_R_start_local[i_d] + FFT->n_R_local[i_d] - 1;
        FFT->i_k_stop_local[i_d] = FFT->i_k_start_local[i_d] + FFT->n_k_local[i_d] - 1;
    }

    // FFTW padding sizes
    if(FFT->n_d > 1) {
        FFT->pad_size_R = 2 * (FFT->n_R_local[FFT->n_d - 1] / 2 + 1) - FFT->n_R_local[FFT->n_d - 1];
        FFT->pad_size_k = 0;
    } else {
        FFT->pad_size_R = 0;
        FFT->pad_size_k = 0;
    }

    // Number of elements (global and local) in the FFT
    FFT->n_field         = 1;
    FFT->n_field_R_local = 1;
    FFT->n_field_k_local = 1;
    for(ptrdiff_t i_d = 0; i_d < FFT->n_d; i_d++) {
        FFT->n_field *= (size_t)FFT->n[i_d];
        FFT->n_field_R_local *= (size_t)FFT->n_R_local[i_d];
        FFT->n_field_k_local *= (size_t)FFT->n_k_local[i_d];
    }

    // Clear the field
    clear_field(FFT);

    // Initialize the FFT's real-space grid.  R_field[d] holds n+1 cell edges
    // running from 0 to L; dR[d] is the cell width.
    FFT->R_field = (double **)SID_malloc(sizeof(double *) * FFT->n_d);
    FFT->dR      = (double *)SID_malloc(sizeof(double) * FFT->n_d);
    for(ptrdiff_t i_d = 0; i_d < FFT->n_d; i_d++) {
        FFT->R_field[i_d] = (double *)SID_malloc(sizeof(double) * (FFT->n[i_d] + 1));
        FFT->dR[i_d]      = FFT->L[i_d] / (double)(FFT->n[i_d]);
        for(ptrdiff_t i_i = 0; i_i < FFT->n[i_d]; i_i++)
            FFT->R_field[i_d][i_i] = FFT->L[i_d] * ((double)i_i / (double)(FFT->n[i_d]));
        FFT->R_field[i_d][FFT->n[i_d]] = FFT->L[i_d];
    }

    // Initialize the FFT's k-space grid.  Indices at or above n/2 are stored
    // as negative wavenumbers.
    FFT->k_field   = (double **)SID_malloc(sizeof(double *) * FFT->n_d);
    FFT->dk        = (double *)SID_malloc(sizeof(double) * FFT->n_d);
    FFT->k_Nyquist = (double *)SID_malloc(sizeof(double) * FFT->n_d);
    for(ptrdiff_t i_d = 0; i_d < FFT->n_d; i_d++) {
        FFT->k_field[i_d]   = (double *)SID_malloc(sizeof(double) * FFT->n[i_d]);
        FFT->dk[i_d]        = TWO_PI / FFT->L[i_d];
        FFT->k_Nyquist[i_d] = TWO_PI * (double)(FFT->n[i_d]) / FFT->L[i_d] / 2.;
        for(ptrdiff_t i_i = 0; i_i < FFT->n[i_d]; i_i++) {
            if(i_i >= FFT->n[i_d] / 2)
                FFT->k_field[i_d][i_i] = TWO_PI * (double)(i_i - FFT->n[i_d]) / FFT->L[i_d];
            else
                FFT->k_field[i_d][i_i] = TWO_PI * (double)(i_i) / FFT->L[i_d];
        }
    }

    // Flags
    FFT->flag_padded = GBP_FALSE;

    // Slab info
    FFT->slab.n_x_local       = FFT->n_R_local[0];
    FFT->slab.i_x_start_local = FFT->i_R_start_local[0];
    FFT->slab.i_x_stop_local  = FFT->i_R_stop_local[0];
    FFT->slab.x_min_local     = FFT->R_field[0][FFT->i_R_start_local[0]];
    if(FFT->slab.n_x_local > 0)
        FFT->slab.x_max_local = FFT->R_field[0][FFT->i_R_stop_local[0] + 1];
    else
        FFT->slab.x_max_local = FFT->slab.x_min_local;
    SID_Allreduce(&(FFT->slab.x_max_local), &(FFT->slab.x_max), 1, SID_DOUBLE, SID_MAX, SID_COMM_WORLD);

#if USE_MPI
    // All ranks are not necessarily assigned any slices, so
    //   we need to figure out what ranks are to the right and the left for
    //   buffer exchanges.
    // The per-rank slab counts are kept as int so that the element type
    // matches the SID_INT used by the reductions and broadcasts below.
    int  flag_active;
    int  n_active;
    int  min_size;
    int  max_size;
    int *n_x_rank = (int *)SID_malloc(sizeof(int) * SID.n_proc);

    n_x_rank[SID.My_rank] = (int)FFT->slab.n_x_local;
    if(n_x_rank[SID.My_rank] > 0)
        flag_active = GBP_TRUE;
    else
        flag_active = GBP_FALSE;
    SID_Allreduce(&flag_active, &n_active, 1, SID_INT, SID_SUM, SID_COMM_WORLD);
    SID_Allreduce(&n_x_rank[SID.My_rank], &min_size, 1, SID_INT, SID_MIN, SID_COMM_WORLD);
    SID_Allreduce(&n_x_rank[SID.My_rank], &max_size, 1, SID_INT, SID_MAX, SID_COMM_WORLD);
    for(int i_rank = 0; i_rank < SID.n_proc; i_rank++)
        SID_Bcast(&(n_x_rank[i_rank]), 1, SID_INT, i_rank, SID_COMM_WORLD);

    // Nearest higher-numbered rank (cyclically) that owns at least one slice
    FFT->slab.rank_to_right = -1;
    for(int i_rank = SID.My_rank + 1; i_rank < SID.My_rank + SID.n_proc && FFT->slab.rank_to_right < 0; i_rank++) {
        int j_rank = i_rank % SID.n_proc;
        if(n_x_rank[j_rank] > 0)
            FFT->slab.rank_to_right = j_rank;
    }
    if(FFT->slab.rank_to_right < 0)
        FFT->slab.rank_to_right = SID.My_rank;

    // Nearest lower-numbered rank (cyclically) that owns at least one slice
    FFT->slab.rank_to_left = -1;
    for(int i_rank = SID.My_rank - 1; i_rank > SID.My_rank - SID.n_proc && FFT->slab.rank_to_left < 0; i_rank--) {
        int j_rank = i_rank;
        if(i_rank < 0)
            j_rank = i_rank + SID.n_proc;
        if(n_x_rank[j_rank] > 0)
            FFT->slab.rank_to_left = j_rank;
    }
    if(FFT->slab.rank_to_left < 0)
        FFT->slab.rank_to_left = SID.My_rank;

    // Release with the deallocator that matches SID_malloc
    SID_free(SID_FARG n_x_rank);
    SID_log("(%d cores unused, min/max slab size=%d/%d)...", SID_LOG_CONTINUE, SID.n_proc - n_active, min_size, max_size);
#else
    // Without MPI this rank is its own neighbour on both sides
    FFT->slab.rank_to_right = SID.My_rank;
    FFT->slab.rank_to_left  = SID.My_rank;
#endif

    SID_log("Done.", SID_LOG_CLOSE);
}
/*! Initialization of the non-periodic PM routines. The plan-files for FFTW * are created. Finally, the routine to set-up the non-periodic Greens * function is called. */ void pm_init_nonperiodic(void) { int i, slab_to_task_local[GRID]; double bytes_tot = 0; size_t bytes; /* Set up the FFTW plan files. */ fft_forward_plan = rfftw3d_mpi_create_plan(MPI_COMM_WORLD, GRID, GRID, GRID, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE); fft_inverse_plan = rfftw3d_mpi_create_plan(MPI_COMM_WORLD, GRID, GRID, GRID, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE); /* Workspace out the ranges on each processor. */ rfftwnd_mpi_local_sizes(fft_forward_plan, &nslab_x, &slabstart_x, &nslab_y, &slabstart_y, &fftsize); for(i = 0; i < GRID; i++) slab_to_task_local[i] = 0; for(i = 0; i < nslab_x; i++) slab_to_task_local[slabstart_x + i] = ThisTask; MPI_Allreduce(slab_to_task_local, slab_to_task, GRID, MPI_INT, MPI_SUM, MPI_COMM_WORLD); slabs_per_task = malloc(NTask * sizeof(int)); MPI_Allgather(&nslab_x, 1, MPI_INT, slabs_per_task, 1, MPI_INT, MPI_COMM_WORLD); #ifndef PERIODIC if(ThisTask == 0) { for(i = 0; i < NTask; i++) printf("Task=%d FFT-Slabs=%d\n", i, slabs_per_task[i]); } #endif first_slab_of_task = malloc(NTask * sizeof(int)); MPI_Allgather(&slabstart_x, 1, MPI_INT, first_slab_of_task, 1, MPI_INT, MPI_COMM_WORLD); meshmin_list = malloc(3 * NTask * sizeof(int)); meshmax_list = malloc(3 * NTask * sizeof(int)); MPI_Allreduce(&fftsize, &maxfftsize, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); /* now allocate memory to hold the FFT fields */ #if !defined(PERIODIC) if(!(kernel[0] = (fftw_real *) malloc(bytes = fftsize * sizeof(fftw_real)))) { printf("failed to allocate memory for `FFT-kernel[0]' (%g MB).\n", bytes / (1024.0 * 1024.0)); endrun(1); } bytes_tot += bytes; fft_of_kernel[0] = (fftw_complex *) kernel[0]; #endif #if defined(PLACEHIGHRESREGION) if(!(kernel[1] = (fftw_real *) malloc(bytes = fftsize * sizeof(fftw_real)))) { printf("failed to allocate memory for 
`FFT-kernel[1]' (%g MB).\n", bytes / (1024.0 * 1024.0)); endrun(1); } bytes_tot += bytes; fft_of_kernel[1] = (fftw_complex *) kernel[1]; #endif if(ThisTask == 0) printf("\nAllocated %g MByte for FFT kernel(s).\n\n", bytes_tot / (1024.0 * 1024.0)); }
/*! This routines generates the FFTW-plans to carry out the parallel FFTs * later on. Some auxiliary variables are also initialized. */ void pm_init_periodic(void) { int i; int slab_to_task_local[PMGRID]; All.Asmth[0] = ASMTH * All.BoxSize / PMGRID; All.Rcut[0] = RCUT * All.Asmth[0]; /* Set up the FFTW plan files. */ #ifndef NOMPI fft_forward_plan = rfftw3d_mpi_create_plan(MPI_COMM_WORLD, PMGRID, PMGRID, PMGRID, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE); fft_inverse_plan = rfftw3d_mpi_create_plan(MPI_COMM_WORLD, PMGRID, PMGRID, PMGRID, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE); #else fft_forward_plan = rfftw3d_create_plan(PMGRID, PMGRID, PMGRID, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE); fft_inverse_plan = rfftw3d_create_plan(PMGRID, PMGRID, PMGRID, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE); #endif /* Workspace out the ranges on each processor. */ rfftwnd_mpi_local_sizes(fft_forward_plan, &nslab_x, &slabstart_x, &nslab_y, &slabstart_y, &fftsize); for(i = 0; i < PMGRID; i++) slab_to_task_local[i] = 0; for(i = 0; i < nslab_x; i++) slab_to_task_local[slabstart_x + i] = ThisTask; slabs_per_task = malloc(NTask * sizeof(int)); #ifndef NOMPI MPI_Allreduce(slab_to_task_local, slab_to_task, PMGRID, MPI_INT, MPI_SUM, MPI_COMM_WORLD); MPI_Allreduce(&nslab_x, &smallest_slab, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); MPI_Allgather(&nslab_x, 1, MPI_INT, slabs_per_task, 1, MPI_INT, MPI_COMM_WORLD); #else slab_to_task = slab_to_task_local; smallest_slab = nslab_x; slabs_per_task[0] = nslab_x; #endif if(ThisTask == 0) { for(i = 0; i < NTask; i++) printf("Task=%d FFT-Slabs=%d\n", i, slabs_per_task[i]); } first_slab_of_task = malloc(NTask * sizeof(int)); MPI_Allgather(&slabstart_x, 1, MPI_INT, first_slab_of_task, 1, MPI_INT, MPI_COMM_WORLD); meshmin_list = malloc(3 * NTask * sizeof(int)); meshmax_list = malloc(3 * NTask * sizeof(int)); to_slab_fac = PMGRID / All.BoxSize; #ifndef NOMPI MPI_Allreduce(&fftsize, &maxfftsize, 1, MPI_INT, MPI_MAX, 
MPI_COMM_WORLD); #else maxfftsize = fftsize; #endif }
maxwell_data *create_maxwell_data(int nx, int ny, int nz, int *local_N, int *N_start, int *alloc_N, int num_bands, int max_fft_bands) { int n[3], rank = (nz == 1) ? (ny == 1 ? 1 : 2) : 3; maxwell_data *d = 0; int fft_data_size; n[0] = nx; n[1] = ny; n[2] = nz; #if !defined(HAVE_FFTW) && !defined(HAVE_FFTW3) # error Non-FFTW FFTs are not currently supported. #endif #if defined(HAVE_FFTW) CHECK(sizeof(fftw_real) == sizeof(real), "floating-point type is inconsistent with FFTW!"); #endif CHK_MALLOC(d, maxwell_data, 1); d->nx = nx; d->ny = ny; d->nz = nz; d->max_fft_bands = MIN2(num_bands, max_fft_bands); maxwell_set_num_bands(d, num_bands); d->current_k[0] = d->current_k[1] = d->current_k[2] = 0.0; d->parity = NO_PARITY; d->last_dim_size = d->last_dim = n[rank - 1]; /* ----------------------------------------------------- */ d->nplans = 1; #ifndef HAVE_MPI d->local_nx = nx; d->local_ny = ny; d->local_x_start = d->local_y_start = 0; *local_N = *alloc_N = nx * ny * nz; *N_start = 0; d->other_dims = *local_N / d->last_dim; d->fft_data = 0; /* initialize it here for use in specific planner? 
*/ # if defined(HAVE_FFTW3) d->nplans = 0; /* plans will be created as needed */ # ifdef SCALAR_COMPLEX d->fft_output_size = fft_data_size = nx * ny * nz; # else d->last_dim_size = 2 * (d->last_dim / 2 + 1); d->fft_output_size = (fft_data_size = d->other_dims * d->last_dim_size)/2; # endif # elif defined(HAVE_FFTW) # ifdef SCALAR_COMPLEX d->fft_output_size = fft_data_size = nx * ny * nz; d->plans[0] = fftwnd_create_plan_specific(rank, n, FFTW_BACKWARD, FFTW_ESTIMATE | FFTW_IN_PLACE, (fftw_complex*) d->fft_data, 3 * d->num_fft_bands, (fftw_complex*) d->fft_data, 3 * d->num_fft_bands); d->iplans[0] = fftwnd_create_plan_specific(rank, n, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_IN_PLACE, (fftw_complex*) d->fft_data, 3 * d->num_fft_bands, (fftw_complex*) d->fft_data, 3 * d->num_fft_bands); # else /* not SCALAR_COMPLEX */ d->last_dim_size = 2 * (d->last_dim / 2 + 1); d->fft_output_size = (fft_data_size = d->other_dims * d->last_dim_size)/2; d->plans[0] = rfftwnd_create_plan_specific(rank, n, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE, (fftw_real*) d->fft_data, 3 * d->num_fft_bands, (fftw_real*) d->fft_data, 3 * d->num_fft_bands); d->iplans[0] = rfftwnd_create_plan_specific(rank, n, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE, (fftw_real*) d->fft_data, 3 * d->num_fft_bands, (fftw_real*) d->fft_data, 3 * d->num_fft_bands); # endif /* not SCALAR_COMPLEX */ # endif /* HAVE_FFTW */ #else /* HAVE_MPI */ /* ----------------------------------------------------- */ # if defined(HAVE_FFTW3) { int i; ptrdiff_t np[3], local_nx, local_ny, local_x_start, local_y_start; CHECK(rank > 1, "rank < 2 MPI computations are not supported"); d->nplans = 0; /* plans will be created as needed */ for (i = 0; i < rank; ++i) np[i] = n[i]; # ifndef SCALAR_COMPLEX d->last_dim_size = 2 * (np[rank-1] = d->last_dim / 2 + 1); # endif fft_data_size = *alloc_N = FFTW(mpi_local_size_transposed)(rank, np, MPI_COMM_WORLD, &local_nx, &local_x_start, &local_ny, &local_y_start); # ifndef 
SCALAR_COMPLEX fft_data_size = (*alloc_N *= 2); // convert to # of real scalars # endif d->local_nx = local_nx; d->local_x_start = local_x_start; d->local_ny = local_ny; d->local_y_start = local_y_start; d->fft_output_size = nx * d->local_ny * (rank==3 ? np[2] : nz); *local_N = d->local_nx * ny * nz; *N_start = d->local_x_start * ny * nz; d->other_dims = *local_N / d->last_dim; } # elif defined(HAVE_FFTW) CHECK(rank > 1, "rank < 2 MPI computations are not supported"); # ifdef SCALAR_COMPLEX d->iplans[0] = fftwnd_mpi_create_plan(MPI_COMM_WORLD, rank, n, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_IN_PLACE); { int nt[3]; /* transposed dimensions for reverse FFT */ nt[0] = n[1]; nt[1] = n[0]; nt[2] = n[2]; d->plans[0] = fftwnd_mpi_create_plan(MPI_COMM_WORLD, rank, nt, FFTW_BACKWARD, FFTW_ESTIMATE | FFTW_IN_PLACE); } fftwnd_mpi_local_sizes(d->iplans[0], &d->local_nx, &d->local_x_start, &d->local_ny, &d->local_y_start, &fft_data_size); d->fft_output_size = nx * d->local_ny * nz; # else /* not SCALAR_COMPLEX */ CHECK(rank > 1, "rank < 2 MPI computations are not supported"); d->iplans[0] = rfftwnd_mpi_create_plan(MPI_COMM_WORLD, rank, n, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE); /* Unlike fftwnd_mpi, we do *not* pass transposed dimensions for the reverse transform here--we always pass the dimensions of the original real array, and rfftwnd_mpi assumes that if one transform is transposed, then the other is as well. 
*/ d->plans[0] = rfftwnd_mpi_create_plan(MPI_COMM_WORLD, rank, n, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE); rfftwnd_mpi_local_sizes(d->iplans[0], &d->local_nx, &d->local_x_start, &d->local_ny, &d->local_y_start, &fft_data_size); d->last_dim_size = 2 * (d->last_dim / 2 + 1); if (rank == 2) d->fft_output_size = nx * d->local_ny * nz; else d->fft_output_size = nx * d->local_ny * (d->last_dim_size / 2); # endif /* not SCALAR_COMPLEX */ *local_N = d->local_nx * ny * nz; *N_start = d->local_x_start * ny * nz; *alloc_N = *local_N; d->other_dims = *local_N / d->last_dim; # endif /* HAVE_FFTW */ #endif /* HAVE_MPI */ /* ----------------------------------------------------- */ #ifdef HAVE_FFTW CHECK(d->plans[0] && d->iplans[0], "FFTW plan creation failed"); #endif CHK_MALLOC(d->eps_inv, symmetric_matrix, d->fft_output_size); /* A scratch output array is required because the "ordinary" arrays are not in a cartesian basis (or even a constant basis). */ fft_data_size *= d->max_fft_bands; #if defined(HAVE_FFTW3) d->fft_data = (scalar *) FFTW(malloc)(sizeof(scalar) * 3 * fft_data_size); CHECK(d->fft_data, "out of memory!"); d->fft_data2 = d->fft_data; /* works in-place */ #else CHK_MALLOC(d->fft_data, scalar, 3 * fft_data_size); d->fft_data2 = d->fft_data; /* works in-place */ #endif CHK_MALLOC(d->k_plus_G, k_data, *local_N); CHK_MALLOC(d->k_plus_G_normsqr, real, *local_N); d->eps_inv_mean = 1.0; d->local_N = *local_N; d->N_start = *N_start; d->alloc_N = *alloc_N; d->N = nx * ny * nz; return d; }
/*! This routines generates the FFTW-plans to carry out the parallel FFTs * later on. Some auxiliary variables are also initialized. */ void pm_init_periodic(void) { int i; int slab_to_task_local[PMGRID]; All.Asmth[0] = ASMTH * All.BoxSize / PMGRID; All.Rcut[0] = RCUT * All.Asmth[0]; /* Initialize FFTW MPI */ #ifdef FFTW3 fftw_mpi_init(); #endif #ifdef FFTW3 /* If using FFTW3, don't create plans yet, just figure out the local array sizes */ fftsize_complex = fftw_mpi_local_size_3d_transposed(PMGRID, PMGRID, 0.5*PMGRID2, MPI_COMM_WORLD, &nslab_x, &slabstart_x, &nslab_y, &slabstart_y); fftsize_real = 2.*fftsize_complex; fftw_plan_exists = false; #else /* Set up the FFTW plan files. */ fft_forward_plan = rfftw3d_mpi_create_plan(MPI_COMM_WORLD, PMGRID, PMGRID, PMGRID, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE); fft_inverse_plan = rfftw3d_mpi_create_plan(MPI_COMM_WORLD, PMGRID, PMGRID, PMGRID, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE); /* Workspace out the ranges on each processor. 
*/ rfftwnd_mpi_local_sizes(fft_forward_plan, &nslab_x, &slabstart_x, &nslab_y, &slabstart_y, &fftsize); #endif for(i = 0; i < PMGRID; i++) slab_to_task_local[i] = 0; for(i = 0; i < nslab_x; i++) slab_to_task_local[slabstart_x + i] = ThisTask; MPI_Allreduce(slab_to_task_local, slab_to_task, PMGRID, MPI_INT, MPI_SUM, MPI_COMM_WORLD); MPI_Allreduce(&nslab_x, &smallest_slab, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); slabs_per_task = malloc(NTask * sizeof(int)); MPI_Allgather(&nslab_x, 1, MPI_INT, slabs_per_task, 1, MPI_INT, MPI_COMM_WORLD); if(ThisTask == 0) { for(i = 0; i < NTask; i++) printf("Task=%d FFT-Slabs=%d\n", i, slabs_per_task[i]); } first_slab_of_task = malloc(NTask * sizeof(int)); MPI_Allgather(&slabstart_x, 1, MPI_INT, first_slab_of_task, 1, MPI_INT, MPI_COMM_WORLD); meshmin_list = malloc(3 * NTask * sizeof(int)); meshmax_list = malloc(3 * NTask * sizeof(int)); to_slab_fac = PMGRID / All.BoxSize; MPI_Allreduce(&fftsize, &maxfftsize, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); }