int mcfft3_init(int pad1 /* padding on the first axis */, int nx, int ny, int nz /* input data size */, int *nx2, int *ny2, int *nz2 /* padded data size */, int *n_local, int *o_local /* local size & start */) /*< initialize >*/ { int cpuid; MPI_Comm_rank(MPI_COMM_WORLD, &cpuid); if (threads_ok) threads_ok = fftwf_init_threads(); fftwf_mpi_init(); if (false) sf_warning("Using threaded FFTW3! \n"); if (threads_ok) fftwf_plan_with_nthreads(omp_get_max_threads()); /* axis 1 */ nk = n1 = kiss_fft_next_fast_size(nx*pad1); /* axis 2 */ n2 = kiss_fft_next_fast_size(ny); /* axis 3 */ n3 = kiss_fft_next_fast_size(nz); alloc_local = fftwf_mpi_local_size_3d(n3, n2, n1, MPI_COMM_WORLD, &local_n0, &local_0_start); //cc = sf_complexalloc3(n1,n2,n3); cc = sf_complexalloc(alloc_local); cfg = fftwf_mpi_plan_dft_3d(n3,n2,n1, (fftwf_complex *) cc, (fftwf_complex *) cc, MPI_COMM_WORLD, FFTW_FORWARD, FFTW_MEASURE); icfg = fftwf_mpi_plan_dft_3d(n3,n2,n1, (fftwf_complex *) cc, (fftwf_complex *) cc, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_MEASURE); if (NULL == cfg || NULL == icfg) sf_error("FFTW failure."); *nx2 = n1; *ny2 = n2; *nz2 = n3; *n_local = (int) local_n0; *o_local = (int) local_0_start; wt = 1.0/(n3*n2*n1); return (nk*n2*n3); }
int main(int argc, char **argv) { fftwf_plan plan; fftwf_complex *data; ptrdiff_t alloc_local, local_n0, local_0_start, i, j; if (argc != 2) { printf("usage: ./fft_mpi MATRIX_SIZE\n"); exit(1); } const ptrdiff_t N0 = atoi(argv[1]); const ptrdiff_t N1 = N0; int id; double startTime, totalTime; totalTime = 0; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &id); fftwf_mpi_init(); /* get local data size and allocate */ alloc_local = fftwf_mpi_local_size_2d(N0, N1, MPI_COMM_WORLD, &local_n0, &local_0_start); data = fftwf_alloc_complex(alloc_local);//(fftwf_complex *) fftwf_malloc(sizeof(fftw_complex) * alloc_local); /* create plan for in-place forward DFT */ plan = fftwf_mpi_plan_dft_2d(N0, N1, data, data, MPI_COMM_WORLD, FFTW_FORWARD, FFTW_ESTIMATE); /* initialize data to some function my_function(x,y) */ for (i = 0; i < local_n0; ++i) for (j = 0; j < N1; ++j){ data[i*N1 + j][0] = local_0_start;;//my_function(local_0_start + i, j); data[i*N1 + j][1]=i; } /* compute transforms, in-place, as many times as desired */ MPI_Barrier(MPI_COMM_WORLD); if (id == 0) { startTime = getTime(); } fftwf_execute(plan); MPI_Barrier(MPI_COMM_WORLD); if (id == 0) { totalTime += getTime() - startTime; } fftwf_destroy_plan(plan); fftwf_mpi_cleanup(); if (id == 0) { printf("%.5f\n", totalTime); } MPI_Finalize(); return 0; }
/*
 * Set up the out-of-place 2-D complex FFT distributed over `comm`.
 *
 * Both axes are padded to the next size accepted by
 * kiss_fft_next_fast_size() (axis 1 additionally scaled by pad1).  The
 * routine fills the file-level state (n1, n2, nk, alloc_local, local_n0,
 * local_0_start, cc, dd, cfg, icfg, wt): the forward plan maps cc -> dd and
 * the backward plan maps dd -> cc.  Padded sizes and this rank's slab are
 * reported through the output pointers.
 *
 * Returns nk*n2.
 */
int cfft2_init(int pad1           /* padding on the first axis */,
               int nx,   int ny   /* input data size */,
               int *nx2, int *ny2 /* padded data size */,
               int *n_local, int *o_local /* local size & start */,
               MPI_Comm comm)
/*< initialize >*/
{
    /* thread support is initialised ahead of the MPI layer */
    if (threads_ok) threads_ok = fftwf_init_threads();
    fftwf_mpi_init();
    if (threads_ok) fftwf_plan_with_nthreads(omp_get_max_threads());

    nk = n1 = kiss_fft_next_fast_size(nx*pad1);
    n2 = kiss_fft_next_fast_size(ny);

    /* slab decomposition along the slow axis (n2) */
    alloc_local = fftwf_mpi_local_size_2d(n2, n1, comm, &local_n0, &local_0_start);

    cc = sf_complexalloc(alloc_local);
    dd = sf_complexalloc(alloc_local);

    cfg  = fftwf_mpi_plan_dft_2d(n2, n1,
                                 (fftwf_complex *) cc, (fftwf_complex *) dd,
                                 comm, FFTW_FORWARD, FFTW_MEASURE);
    icfg = fftwf_mpi_plan_dft_2d(n2, n1,
                                 (fftwf_complex *) dd, (fftwf_complex *) cc,
                                 comm, FFTW_BACKWARD, FFTW_MEASURE);
    if (NULL == cfg || NULL == icfg) sf_error("FFTW failure.");

    *nx2 = n1;
    *ny2 = n2;
    *n_local = (int) local_n0;
    *o_local = (int) local_0_start;

    /* product formed in floating point so it cannot wrap in int */
    wt = 1.0/((double) n1 * (double) n2);

    return (nk*n2);
}
// Allocate the local field buffer (FFT->total_local_size real elements) and
// alias the complex view onto the same memory; the transforms run in place.
static void init_field_alloc_local(field_info *FFT) {
#if USE_FFTW3
    // fftwf_malloc() takes a byte count, so the request follows
    // sizeof(gbpFFT_real) instead of assuming single precision.
    FFT->field_local = (gbpFFT_real *)fftwf_malloc(sizeof(gbpFFT_real) * FFT->total_local_size);
#else
    FFT->field_local = (gbpFFT_real *)SID_malloc(sizeof(gbpFFT_real) * FFT->total_local_size);
#endif
    FFT->cfield_local = (gbpFFT_complex *)FFT->field_local;
}

// Initialize an n_d-dimensional real<->complex FFT descriptor.
//
//   n_d : number of dimensions
//   n   : grid size along each dimension (n_d entries)
//   L   : box length along each dimension (n_d entries)
//   FFT : descriptor to fill; every array it points to is allocated here
//
// The routine copies the sizes, creates the forward/inverse plans for the
// FFTW flavour selected at compile time (FFTW_V2 / USE_MPI / USE_DOUBLE),
// allocates and clears the local field, builds the real-space and k-space
// coordinate tables, and records the slab owned by this rank together with
// the nearest ranks on either side that own a non-empty slab.
void init_field(int n_d, int *n, double *L, field_info *FFT) {
    ptrdiff_t n_x_local;
    ptrdiff_t i_x_start_local;
    ptrdiff_t n_y_transpose_local;
    ptrdiff_t i_y_start_transpose_local;
    int       flag_active;
    int       n_active;
    int       min_size, max_size;

    SID_log("Initializing ", SID_LOG_OPEN);
    for(ptrdiff_t i_d = 0; i_d < n_d; i_d++) {
        if(i_d < (n_d - 1))
            SID_log("%dx", SID_LOG_CONTINUE, n[i_d]);
        else
            SID_log("%d element %d-d FFT ", SID_LOG_CONTINUE, n[i_d], n_d);
    }
    SID_log("(%d byte precision)...", SID_LOG_CONTINUE, (int)sizeof(GBPREAL));

    // Initialize FFT sizes
    FFT->n_d             = n_d;
    FFT->n               = (ptrdiff_t *)SID_calloc(sizeof(ptrdiff_t) * FFT->n_d);
    FFT->L               = (double *)SID_calloc(sizeof(double) * FFT->n_d);
    FFT->n_k_local       = (ptrdiff_t *)SID_calloc(sizeof(ptrdiff_t) * FFT->n_d);
    FFT->n_R_local       = (ptrdiff_t *)SID_calloc(sizeof(ptrdiff_t) * FFT->n_d);
    FFT->i_R_start_local = (ptrdiff_t *)SID_calloc(sizeof(ptrdiff_t) * FFT->n_d);
    FFT->i_k_start_local = (ptrdiff_t *)SID_calloc(sizeof(ptrdiff_t) * FFT->n_d);
    FFT->i_R_stop_local  = (ptrdiff_t *)SID_calloc(sizeof(ptrdiff_t) * FFT->n_d);
    FFT->i_k_stop_local  = (ptrdiff_t *)SID_calloc(sizeof(ptrdiff_t) * FFT->n_d);
    for(ptrdiff_t i_d = 0; i_d < FFT->n_d; i_d++) {
        FFT->n[i_d]               = n[i_d];
        FFT->L[i_d]               = L[i_d];
        FFT->i_R_start_local[i_d] = 0;
        FFT->i_k_start_local[i_d] = 0;
        FFT->n_R_local[i_d]       = FFT->n[i_d];
        FFT->n_k_local[i_d]       = FFT->n[i_d];
    }
    // The last dimension of the complex array holds n/2+1 elements
    FFT->n_k_local[FFT->n_d - 1] = FFT->n[FFT->n_d - 1] / 2 + 1;

    // Default to an undecomposed grid (one slab spanning the whole array).
    // The MPI branches below overwrite these; the serial branches do not
    // touch them, and they were previously read uninitialized there.
    n_x_local                 = FFT->n_R_local[0];
    i_x_start_local           = 0;
    n_y_transpose_local       = (FFT->n_d > 1) ? FFT->n_k_local[1] : 0;
    i_y_start_transpose_local = 0;

    // Initialize FFTW

    // Create an integer version of FFT->n[] to pass to the planners that take int sizes
    int *n_int = (int *)SID_malloc(sizeof(int) * FFT->n_d);
    for(int i_d = 0; i_d < FFT->n_d; i_d++)
        n_int[i_d] = (int)FFT->n[i_d];
#if FFTW_V2
#if USE_MPI
    int total_local_size_int;
    int n_x_local_int;
    int i_x_start_local_int;
    int n_y_transpose_local_int;
    int i_y_start_transpose_local_int;
    FFT->plan  = rfftwnd_mpi_create_plan(SID.COMM_WORLD->comm, FFT->n_d, n_int, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE);
    FFT->iplan = rfftwnd_mpi_create_plan(SID.COMM_WORLD->comm, FFT->n_d, n_int, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE);
    rfftwnd_mpi_local_sizes(FFT->plan,
                            &(n_x_local_int),
                            &(i_x_start_local_int),
                            &(n_y_transpose_local_int),
                            &(i_y_start_transpose_local_int),
                            &total_local_size_int);
    n_x_local                 = (ptrdiff_t)n_x_local_int;
    i_x_start_local           = (ptrdiff_t)i_x_start_local_int;
    n_y_transpose_local       = (ptrdiff_t)n_y_transpose_local_int;
    i_y_start_transpose_local = (ptrdiff_t)i_y_start_transpose_local_int;
    FFT->total_local_size     = (size_t)total_local_size_int;
#else
    FFT->total_local_size = 1;
    for(ptrdiff_t i_d = 0; i_d < FFT->n_d; i_d++) {
        if(i_d < FFT->n_d - 1)
            FFT->total_local_size *= FFT->n[i_d];
        else
            FFT->total_local_size *= 2 * (FFT->n[i_d] / 2 + 1);
    }
#if USE_DOUBLE
    FFT->plan  = fftwnd_create_plan(FFT->n_d, n_int, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE);
    FFT->iplan = fftwnd_create_plan(FFT->n_d, n_int, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
#else
    FFT->plan  = rfftwnd_create_plan(FFT->n_d, n_int, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE);
    FFT->iplan = rfftwnd_create_plan(FFT->n_d, n_int, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
#endif
#endif
    // These planners take no array arguments, so the buffer can follow them.
    init_field_alloc_local(FFT);
#else
    // The planners in this branch are handed the arrays they will transform,
    // so the buffer must exist before any plan is created.
#if USE_MPI
#if USE_DOUBLE
    fftw_mpi_init();
    // NOTE(review): the sizes passed here are the real-space dimensions and the
    // result is later used as a count of real elements -- confirm that this
    // covers the padded in-place r2c layout.
    FFT->total_local_size = fftw_mpi_local_size_many_transposed(FFT->n_d,
                                                                FFT->n,
                                                                1,
                                                                FFTW_MPI_DEFAULT_BLOCK,
                                                                FFTW_MPI_DEFAULT_BLOCK,
                                                                SID_COMM_WORLD->comm,
                                                                &(n_x_local),
                                                                &(i_x_start_local),
                                                                &(n_y_transpose_local),
                                                                &(i_y_start_transpose_local));
    init_field_alloc_local(FFT);
    FFT->plan  = fftw_mpi_plan_dft_r2c(FFT->n_d, FFT->n, FFT->field_local, FFT->cfield_local, SID_COMM_WORLD->comm, FFTW_ESTIMATE);
    FFT->iplan = fftw_mpi_plan_dft_c2r(FFT->n_d, FFT->n, FFT->cfield_local, FFT->field_local, SID_COMM_WORLD->comm, FFTW_ESTIMATE);
#else
    fftwf_mpi_init();
    // NOTE(review): same sizing question as in the double-precision branch.
    FFT->total_local_size = fftwf_mpi_local_size_many_transposed(FFT->n_d,
                                                                 FFT->n,
                                                                 1,
                                                                 FFTW_MPI_DEFAULT_BLOCK,
                                                                 FFTW_MPI_DEFAULT_BLOCK,
                                                                 SID_COMM_WORLD->comm,
                                                                 &(n_x_local),
                                                                 &(i_x_start_local),
                                                                 &(n_y_transpose_local),
                                                                 &(i_y_start_transpose_local));
    init_field_alloc_local(FFT);
    FFT->plan  = fftwf_mpi_plan_dft_r2c(FFT->n_d, FFT->n, FFT->field_local, FFT->cfield_local, SID_COMM_WORLD->comm, FFTW_ESTIMATE);
    FFT->iplan = fftwf_mpi_plan_dft_c2r(FFT->n_d, FFT->n, FFT->cfield_local, FFT->field_local, SID_COMM_WORLD->comm, FFTW_ESTIMATE);
#endif
#else
    FFT->total_local_size = 1;
    for(ptrdiff_t i_d = 0; i_d < FFT->n_d; i_d++) {
        if(i_d < FFT->n_d - 1)
            FFT->total_local_size *= FFT->n[i_d];
        else
            FFT->total_local_size *= 2 * (FFT->n[i_d] / 2 + 1);
    }
    init_field_alloc_local(FFT);
    // The serial planners take `const int *` sizes: pass n_int, not the
    // ptrdiff_t array FFT->n.
#if USE_DOUBLE
    FFT->plan  = fftw_plan_dft_r2c(FFT->n_d, n_int, FFT->field_local, FFT->cfield_local, FFTW_ESTIMATE);
    FFT->iplan = fftw_plan_dft_c2r(FFT->n_d, n_int, FFT->cfield_local, FFT->field_local, FFTW_ESTIMATE);
#else
    FFT->plan  = fftwf_plan_dft_r2c(FFT->n_d, n_int, FFT->field_local, FFT->cfield_local, FFTW_ESTIMATE);
    FFT->iplan = fftwf_plan_dft_c2r(FFT->n_d, n_int, FFT->cfield_local, FFT->field_local, FFTW_ESTIMATE);
#endif
#endif
#endif
    SID_free(SID_FARG n_int);

    // Set empty slabs to start at 0 to make ignoring them simple.
    if(n_x_local == 0)
        i_x_start_local = 0;
    if(n_y_transpose_local == 0)
        i_y_start_transpose_local = 0;

    // Modify the local slab dimensions according to what FFTW chose.
    FFT->i_R_start_local[0] = i_x_start_local;
    FFT->n_R_local[0]       = n_x_local;
    if(FFT->n_d > 1) {
        FFT->i_k_start_local[1] = i_y_start_transpose_local;
        FFT->n_k_local[1]       = n_y_transpose_local;
    }

    // Upper limits of slab decomposition
    for(ptrdiff_t i_d = 0; i_d < FFT->n_d; i_d++) {
        FFT->i_R_stop_local[i_d] = FFT->i_R_start_local[i_d] + FFT->n_R_local[i_d] - 1;
        FFT->i_k_stop_local[i_d] = FFT->i_k_start_local[i_d] + FFT->n_k_local[i_d] - 1;
    }

    // FFTW padding sizes
    if(FFT->n_d > 1) {
        FFT->pad_size_R = 2 * (FFT->n_R_local[FFT->n_d - 1] / 2 + 1) - FFT->n_R_local[FFT->n_d - 1];
        FFT->pad_size_k = 0;
    } else {
        FFT->pad_size_R = 0;
        FFT->pad_size_k = 0;
    }

    // Number of elements (global and local) in the FFT
    ptrdiff_t i_d = 0;
    for(FFT->n_field = 1, FFT->n_field_R_local = 1, FFT->n_field_k_local = 1; i_d < FFT->n_d; i_d++) {
        FFT->n_field *= (size_t)FFT->n[i_d];
        FFT->n_field_R_local *= (size_t)FFT->n_R_local[i_d];
        FFT->n_field_k_local *= (size_t)FFT->n_k_local[i_d];
    }

    // Clear the field
    clear_field(FFT);

    // Initialize the FFT's real-space grid.  R_field[i_d] has n+1 entries:
    // the n cell edges plus the box length itself.
    FFT->R_field = (double **)SID_malloc(sizeof(double *) * FFT->n_d);
    FFT->dR      = (double *)SID_malloc(sizeof(double) * FFT->n_d);
    for(ptrdiff_t i_d = 0; i_d < FFT->n_d; i_d++) {
        FFT->R_field[i_d] = (double *)SID_malloc(sizeof(double) * (FFT->n[i_d] + 1));
        FFT->dR[i_d]      = FFT->L[i_d] / (double)(FFT->n[i_d]);
        for(ptrdiff_t i_i = 0; i_i < FFT->n[i_d]; i_i++)
            FFT->R_field[i_d][i_i] = FFT->L[i_d] * ((double)i_i / (double)(FFT->n[i_d]));
        FFT->R_field[i_d][FFT->n[i_d]] = FFT->L[i_d];
    }

    // Initialize the FFT's k-space grid.  Indices at or above n/2 map to
    // negative wavenumbers.
    FFT->k_field   = (double **)SID_malloc(sizeof(double *) * FFT->n_d);
    FFT->dk        = (double *)SID_malloc(sizeof(double) * FFT->n_d);
    FFT->k_Nyquist = (double *)SID_malloc(sizeof(double) * FFT->n_d);
    for(ptrdiff_t i_d = 0; i_d < FFT->n_d; i_d++) {
        FFT->k_field[i_d]   = (double *)SID_malloc(sizeof(double) * FFT->n[i_d]);
        FFT->dk[i_d]        = TWO_PI / FFT->L[i_d];
        FFT->k_Nyquist[i_d] = TWO_PI * (double)(FFT->n[i_d]) / FFT->L[i_d] / 2.;
        for(ptrdiff_t i_i = 0; i_i < FFT->n[i_d]; i_i++) {
            if(i_i >= FFT->n[i_d] / 2)
                FFT->k_field[i_d][i_i] = TWO_PI * (double)(i_i - FFT->n[i_d]) / FFT->L[i_d];
            else
                FFT->k_field[i_d][i_i] = TWO_PI * (double)(i_i) / FFT->L[i_d];
        }
    }

    // Flags
    FFT->flag_padded = GBP_FALSE;

    // Slab info
    FFT->slab.n_x_local       = FFT->n_R_local[0];
    FFT->slab.i_x_start_local = FFT->i_R_start_local[0];
    FFT->slab.i_x_stop_local  = FFT->i_R_stop_local[0];
    FFT->slab.x_min_local     = FFT->R_field[0][FFT->i_R_start_local[0]];
    if(FFT->slab.n_x_local > 0)
        FFT->slab.x_max_local = FFT->R_field[0][FFT->i_R_stop_local[0] + 1];
    else
        FFT->slab.x_max_local = FFT->slab.x_min_local;
    SID_Allreduce(&(FFT->slab.x_max_local), &(FFT->slab.x_max), 1, SID_DOUBLE, SID_MAX, SID_COMM_WORLD);

#if USE_MPI
    // All ranks are not necessarily assigned any slices, so
    //   we need to figure out what ranks are to the right and the left for
    //   buffer exchanges.
    // The per-rank slab sizes are kept as int so that the element type agrees
    // with the SID_INT datatype used in the reductions and broadcasts below.
    int *n_x_rank          = (int *)SID_malloc(sizeof(int) * SID.n_proc);
    n_x_rank[SID.My_rank] = (int)FFT->slab.n_x_local;
    if(n_x_rank[SID.My_rank] > 0)
        flag_active = GBP_TRUE;
    else
        flag_active = GBP_FALSE;
    SID_Allreduce(&flag_active, &n_active, 1, SID_INT, SID_SUM, SID_COMM_WORLD);
    SID_Allreduce(&n_x_rank[SID.My_rank], &min_size, 1, SID_INT, SID_MIN, SID_COMM_WORLD);
    SID_Allreduce(&n_x_rank[SID.My_rank], &max_size, 1, SID_INT, SID_MAX, SID_COMM_WORLD);
    // Every rank publishes its own entry in turn
    for(int i_rank = 0; i_rank < SID.n_proc; i_rank++)
        SID_Bcast(&(n_x_rank[i_rank]), 1, SID_INT, i_rank, SID_COMM_WORLD);

    // Walk upwards (with wrap-around) to the first rank holding a slab
    FFT->slab.rank_to_right = -1;
    for(int i_rank = SID.My_rank + 1; i_rank < SID.My_rank + SID.n_proc && FFT->slab.rank_to_right < 0; i_rank++) {
        int j_rank = i_rank % SID.n_proc;
        if(n_x_rank[j_rank] > 0)
            FFT->slab.rank_to_right = j_rank;
    }
    if(FFT->slab.rank_to_right < 0)
        FFT->slab.rank_to_right = SID.My_rank;

    // Walk downwards (with wrap-around) to the first rank holding a slab
    FFT->slab.rank_to_left = -1;
    for(int i_rank = SID.My_rank - 1; i_rank > SID.My_rank - SID.n_proc && FFT->slab.rank_to_left < 0; i_rank--) {
        int j_rank = i_rank;
        if(i_rank < 0)
            j_rank = i_rank + SID.n_proc;
        if(n_x_rank[j_rank] > 0)
            FFT->slab.rank_to_left = j_rank;
    }
    if(FFT->slab.rank_to_left < 0)
        FFT->slab.rank_to_left = SID.My_rank;

    // Release with the allocator's own counterpart
    SID_free(SID_FARG n_x_rank);
    SID_log("(%d cores unused, min/max slab size=%d/%d)...", SID_LOG_CONTINUE, SID.n_proc - n_active, min_size, max_size);
#else
    FFT->slab.rank_to_right = SID.My_rank;
    FFT->slab.rank_to_left  = SID.My_rank;
    if(FFT->slab.n_x_local > 0) {
        flag_active = GBP_TRUE;
        n_active    = 1;
        min_size    = FFT->slab.n_x_local;
        max_size    = FFT->slab.n_x_local;
    } else {
        flag_active = GBP_FALSE;
        n_active    = 0;
        min_size    = 0;
        max_size    = 0;
    }
#endif

    SID_log("Done.", SID_LOG_CLOSE);
}
// Program entry point.
//
// Visible flow: initialise MPI and the FFTW MPI layer, read the parameter file
// named by argv[1], derive the stepping configuration from UseCOLA, print the
// run parameters on task 0, build the initial particle set from the ZA/LPT
// displacement arrays, run nsteps+1 kick/drift iterations (the last one only
// kicks and then jumps to `finalize`), convert the velocities with velRSD(),
// call slice() and print_spec(), free everything and finalise MPI.
//
// argv[1] : parameter file handed to read_parameterfile().
// Returns 0 at the end of a run; when no parameter file is given the program
// finalises MPI and exits with status 0.
int main(int argc, char **argv) {

  // Set up MPI
  // ==========
  // NOTE(review): ierr is assigned by every MPI call in this function but never inspected.
  ierr = MPI_Init(&argc, &argv);
  ierr = MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
  ierr = MPI_Comm_size(MPI_COMM_WORLD, &NTask);
#ifdef SINGLE_PRECISION
  fftwf_mpi_init();
#else
  fftw_mpi_init();
#endif

  if(argc < 2) {
    if(ThisTask == 0) {
      fprintf(stdout, "Input parameters not found\n");
      fprintf(stdout, "Call with <ParameterFile>\n");
    }
    ierr = MPI_Finalize();
    exit(0);
  }

  // Read the run parameters and setup code
  // ======================================
  int stepDistr;    // 0: steps uniform in a, 1: uniform in log(a), 2: uniform in CosmoTime(a)
  int subtractLPT;  // 1 when the LPT terms are subtracted (UseCOLA == 1), else 0
  double da=0;      // step size in the variable selected by stepDistr

  read_parameterfile(argv[1]);

  // UseCOLA selects the whole stepping configuration
  if (UseCOLA == 1){
    subtractLPT = 1;
    stepDistr = 0;
    StdDA = 0;
  } else{
    subtractLPT = 0;
    stepDistr = 1;
    StdDA = 2;
  }
  if (StdDA == 0){
    fullT = 1;
    nLPT = -2.5;
  }
  filter = 0;           // Whether or not to smooth the forces
  Scale = 2.*M_PI/Box;  // The force smoothing scale

  // Only task 0 reports the run parameters
  if(ThisTask == 0) {
    printf("Run Parameters\n");
    printf("==============\n");
    printf("Cosmology:\n");
    printf(" Omega Matter(z=0) = %lf\n",Omega);
    printf(" Omega Baryon(z=0) = %lf\n",OmegaBaryon);
    printf(" Hubble Parameter(z=0) = %lf\n",HubbleParam);
    printf(" Sigma8(z=0) = %lf\n",Sigma8);
#ifndef GAUSSIAN
    printf(" F_nl = %lf\n",Fnl);
#endif
    printf(" Primordial Index = %lf\n",PrimordialIndex);
    printf(" Initial Redshift = %lf\n",Init_Redshift);
    printf(" Final Redshift = %lf\n",Final_Redshift);
#ifndef GAUSSIAN
    printf(" F_nl Redshift = %lf\n",Fnl_Redshift);
#endif
    printf("Simulation:\n");
    printf(" Nmesh = %d\n", Nmesh);
    printf(" Nsample = %d\n", Nsample);
    printf(" Boxsize = %lf\n", Box);
    printf(" Buffer Size = %lf\n", Buffer);
    switch(WhichSpectrum) {
      case 0:
        switch (WhichTransfer) {
          case 1:
            printf(" Using Eisenstein & Hu Transfer Function\n");
            break;
          case 2:
            printf(" Using Tabulated Transfer Function\n");
            break;
          default:
            printf(" Using Efstathiou Transfer Function\n");
            break;
        }
        break;
      case 1:
        printf(" Using Eisenstein & Hu Power Spectrum\n");
        break;
      case 2:
        printf(" Using Tabulated Power Spectrum\n");
        break;
      default:
        printf(" Using Efstathiou Power Spectrum\n");
        break;
    }
    printf(" Number of Timesteps = %d\n",nsteps);
    if (UseCOLA) {
      printf(" Using COLA method\n\n");
    } else {
      printf(" Using Standard PM method\n\n");
    }
    fflush(stdout);
  }

  // Initial and final scale factors:
  double ai=1.0/(1.0+Init_Redshift);
  double af=1.0/(1.0+Final_Redshift);

  // Step size for the spacing chosen above
  if (stepDistr == 0) da=(af-ai)/((double)nsteps);
  if (stepDistr == 1) da=(log(af)-log(ai))/((double)nsteps);
  if (stepDistr == 2) da=(CosmoTime(af)-CosmoTime(ai))/((double)nsteps);

  set_units();

  if (ThisTask == 0) {
    printf("Initialising Transfer Function/Power Spectrum\n");
    printf("=============================================\n");
  }
  initialize_transferfunction();
  initialize_powerspectrum();
  initialize_ffts();
  initialize_parts();

  if(ThisTask == 0) {
    printf("Creating initial conditions\n");
    printf("===========================\n");
    fflush(stdout);
  }

  // Calculate the Zeldovich and 2LPT displacements and create the initial conditions
  // ================================================================================
  int i, j, k, m;
  unsigned int n, coord;
  double A=ai;                // This is the scale factor which we'll be advancing below.
  double Di=growthD(1.0, A);  // initial growth factor
  double Di2=growthD2(A);     // initial 2nd order growth factor
  double Dv=DprimeQ(A,1.0);   // T[D_{za}]=dD_{za}/dy
  double Dv2=growthD2v(A);    // T[D_{2lpt}]=dD_{2lpt}/dy

  displacement_fields();

  // Room for NumPart*Buffer particles.
  // NOTE(review): the result of malloc is not checked before P is written to below.
  P = (struct part_data *) malloc((int)(ceil(NumPart*Buffer))*sizeof(struct part_data));

  // Generate the initial particle positions and velocities
  // If subtractLPT = 0 (non-COLA), then velocity is ds/dy, which is simply the 2LPT IC.
  // Else set vel = 0 if we subtract LPT. This is the same as the action of the operator L_- from TZE,
  // as initial velocities are in 2LPT.
  for(i=0; i<Local_np; i++) {
    for (j=0; j<Nsample; j++) {
      for (k=0; k<Nsample; k++) {
        // coord indexes the local arrays; the ID additionally includes the Local_p_start offset
        coord = (i * Nsample + j) * Nsample + k;
        P[coord].ID = ((i + Local_p_start) * Nsample + j) * Nsample + k;
        for (m=0; m<3; m++) {
          P[coord].Dz[m] = ZA[m][coord];
          P[coord].D2[m] = LPT[m][coord];
          if (subtractLPT == 0) {
            P[coord].Vel[m]=P[coord].Dz[m]*Dv+P[coord].D2[m]*Dv2;
          } else {
            P[coord].Vel[m] = 0.0;
          }
        }
        // Grid position plus the growth-scaled displacements, passed through periodic_wrap()
        P[coord].Pos[0] = periodic_wrap((i+Local_p_start)*(Box/Nsample)+P[coord].Dz[0]*Di+P[coord].D2[0]*Di2);
        P[coord].Pos[1] = periodic_wrap(j*(Box/Nsample)+P[coord].Dz[1]*Di+P[coord].D2[1]*Di2);
        P[coord].Pos[2] = periodic_wrap(k*(Box/Nsample)+P[coord].Dz[2]*Di+P[coord].D2[2]*Di2);
      }
    }
  }

  // The displacement arrays have been copied into P and are released here
  for (i=0; i<3; i++) {
    free(ZA[i]);
    free(LPT[i]);
  }

  // Now, we get to the N-Body part where we evolve with time via the Kick-Drift-Kick Method
  // =======================================================================================
  int timeStep;
  double AF=0,AI,AC,AFF=0;
  double growth1 = Di;
  double growth1L2 = Di2;

  // The density grid and force grids and associated fftw plans.
  // Without MEMORY_MODE they are created once here and released after the loop;
  // with MEMORY_MODE they are created and released inside every iteration.
#ifndef MEMORY_MODE
  density = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
  N11 = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
  N12 = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
  N13 = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
  // Complex views onto the same storage
  P3D = (complex_kind*)density;
  FN11 = (complex_kind*)N11;
  FN12 = (complex_kind*)N12;
  FN13 = (complex_kind*)N13;
#ifdef SINGLE_PRECISION
  plan = fftwf_mpi_plan_dft_r2c_3d(Nmesh,Nmesh,Nmesh,density,P3D,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p11 = fftwf_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN11,N11,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p12 = fftwf_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN12,N12,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p13 = fftwf_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN13,N13,MPI_COMM_WORLD,FFTW_ESTIMATE);
#else
  plan = fftw_mpi_plan_dft_r2c_3d(Nmesh,Nmesh,Nmesh,density,P3D,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p11 = fftw_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN11,N11,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p12 = fftw_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN12,N12,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p13 = fftw_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN13,N13,MPI_COMM_WORLD,FFTW_ESTIMATE);
#endif
#endif

  if(ThisTask == 0) {
    printf("Beginning timestepping\n");
    printf("======================\n");
    fflush(stdout);
  }

  // AI stores the scale factor to which the velocities have been kicked to. Initially it's just A.
  AI=A;

  // timeStep runs from 0 to nsteps inclusive: the extra pass performs the final half kick
  for (timeStep=0;timeStep<=nsteps;timeStep++){

    // AFF is the scale factor to which we should drift the particle positions.
    // AF is the scale factor to which we should kick the particle velocities.
    if (stepDistr == 0) AFF=A+da;
    if (stepDistr == 1) AFF=A*exp(da);
    if (stepDistr == 2) AFF=AofTime(CosmoTime(A)+da);

    // half time-step for final kick
    if (timeStep == nsteps) {
      AF=A;
    } else {
      // Set to mid-point of interval. In the infinitesimal timestep limit, these choices are identical.
      // How one chooses the mid-point when not in that limit is really an extra degree of freedom in the code
      // but Tassev et al. report negligible effects from the different choices below.
      // Hence, this is not exported as an extra switch at this point.
      if (stepDistr == 0) AF=A+da*0.5;
      if (stepDistr == 1) AF=A*exp(da*0.5);
      if (stepDistr == 2) AF=AofTime((CosmoTime(AFF)+CosmoTime(A))*0.5);
    }

    if (ThisTask == 0) {
      printf("Iteration = %d\n------------------\n",timeStep+1);
      printf("a = %lf\n",A);
      printf("z = %lf\n",1.0/A-1.0);
      fflush(stdout);
    }

    // First we check whether all the particles are on the correct processor after the last time step/
    // original 2LPT displacement and move them if not
    if (ThisTask == 0) printf("Moving particles across task boundaries...\n");
    MoveParticles();

#ifdef MEMORY_MODE
    density = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
    P3D = (complex_kind*)density;
#ifdef SINGLE_PRECISION
    plan = fftwf_mpi_plan_dft_r2c_3d(Nmesh,Nmesh,Nmesh,density,P3D,MPI_COMM_WORLD,FFTW_ESTIMATE);
#else
    plan = fftw_mpi_plan_dft_r2c_3d(Nmesh,Nmesh,Nmesh,density,P3D,MPI_COMM_WORLD,FFTW_ESTIMATE);
#endif
#endif

    // Then we do the Cloud-in-Cell assignment to get the density grid and FFT it.
    if (ThisTask == 0) printf("Calculating density using Cloud-in-Cell...\n");
    PtoMesh();

#ifdef MEMORY_MODE
    N11 = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
    N12 = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
    N13 = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
    FN11 = (complex_kind*)N11;
    FN12 = (complex_kind*)N12;
    FN13 = (complex_kind*)N13;
#ifdef SINGLE_PRECISION
    p11 = fftwf_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN11,N11,MPI_COMM_WORLD,FFTW_ESTIMATE);
    p12 = fftwf_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN12,N12,MPI_COMM_WORLD,FFTW_ESTIMATE);
    p13 = fftwf_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN13,N13,MPI_COMM_WORLD,FFTW_ESTIMATE);
#else
    p11 = fftw_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN11,N11,MPI_COMM_WORLD,FFTW_ESTIMATE);
    p12 = fftw_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN12,N12,MPI_COMM_WORLD,FFTW_ESTIMATE);
    p13 = fftw_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN13,N13,MPI_COMM_WORLD,FFTW_ESTIMATE);
#endif
#endif

    // This returns N11,N12,N13 which hold the components of
    // the vector (grad grad^{-2} density) on a grid.
    if (ThisTask == 0) printf("Calculating forces...\n");
    Forces();

    // Disp[] is allocated afresh every iteration and freed after the kick below
#ifdef MEMORY_MODE
    free(density);
    for (i=0; i<3; i++) Disp[i] = (float *)malloc(NumPart*sizeof(float));
#ifdef SINGLE_PRECISION
    fftwf_destroy_plan(plan);
#else
    fftw_destroy_plan(plan);
#endif
#else
    for (i=0; i<3; i++) Disp[i] = (float_kind *)malloc(NumPart*sizeof(float_kind));
#endif

    // Now find the accelerations at the particle positions using 3-linear interpolation.
    if (ThisTask == 0) printf("Calculating accelerations...\n");
    MtoParticles();

#ifdef MEMORY_MODE
    free(N11);
    free(N12);
    free(N13);
#ifdef SINGLE_PRECISION
    fftwf_destroy_plan(p11);
    fftwf_destroy_plan(p12);
    fftwf_destroy_plan(p13);
#else
    fftw_destroy_plan(p11);
    fftw_destroy_plan(p12);
    fftw_destroy_plan(p13);
#endif
#endif

    // Calculate the mean displacement and subtract later.
    if (ThisTask == 0) printf("Calculating mean of displacements...\n");
    double sumDx=0,sumDy=0,sumDz=0;
    for(n=0; n<NumPart; n++) {
      sumDx += Disp[0][n];
      sumDy += Disp[1][n];
      sumDz += Disp[2][n];
    }

    // Make sumDx, sumDy and sumDz global averages
    ierr = MPI_Allreduce(MPI_IN_PLACE,&sumDx,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
    ierr = MPI_Allreduce(MPI_IN_PLACE,&sumDy,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
    ierr = MPI_Allreduce(MPI_IN_PLACE,&sumDz,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);

    sumDx /= (double)TotNumPart; // We will subtract these below to conserve momentum.
    sumDy /= (double)TotNumPart;
    sumDz /= (double)TotNumPart;

    if (ThisTask == 0) {
      printf("Kicking the particles...\n");
      fflush(stdout);
    }

    // Kick
    // ===============
    double dda;
    double q1,q2;
    double ax,ay,az;
    double sumx=0,sumy=0,sumz=0;
    double Om143=pow(Omega/(Omega+(1-Omega)*A*A*A),1./143.);

    // StdDA selects the kick-interval formula
    if (StdDA == 0) {
      dda=Sphi(AI,AF,A);
    } else if (StdDA == 1) {
      dda=(AF-AI)*A/Qfactor(A);
    } else {
      dda=SphiStd(AI,AF);
    }

    q2=1.5*Omega*growth1*growth1*(1.0+7./3.*Om143)*A; // T^2[D_{2lpt}]=d^2 D_{2lpt}/dy^2
    q1=1.5*Omega*growth1*A;                           // T^2[D_{ZA}]=d^2 D_{ZA}/dy^2

    for(n=0; n<NumPart; n++) {

      // Remove the mean displacement computed above
      Disp[0][n] -= sumDx;
      Disp[1][n] -= sumDy;
      Disp[2][n] -= sumDz;

      // The LPT terms contribute only when subtractLPT is 1
      ax=-1.5*Omega*Disp[0][n]-subtractLPT*(P[n].Dz[0]*q1+P[n].D2[0]*q2)/A;
      ay=-1.5*Omega*Disp[1][n]-subtractLPT*(P[n].Dz[1]*q1+P[n].D2[1]*q2)/A;
      az=-1.5*Omega*Disp[2][n]-subtractLPT*(P[n].Dz[2]*q1+P[n].D2[2]*q2)/A;

      P[n].Vel[0] += ax*dda;
      P[n].Vel[1] += ay*dda;
      P[n].Vel[2] += az*dda;

      sumx += P[n].Vel[0];
      sumy += P[n].Vel[1];
      sumz += P[n].Vel[2];
    }

    for (i=0; i<3; i++) free(Disp[i]);

    // Make sumx, sumy and sumz global averages
    ierr = MPI_Allreduce(MPI_IN_PLACE,&sumx,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
    ierr = MPI_Allreduce(MPI_IN_PLACE,&sumy,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
    ierr = MPI_Allreduce(MPI_IN_PLACE,&sumz,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);

    sumx /= (double)TotNumPart; // We will subtract these below to conserve momentum.
    sumy /= (double)TotNumPart; // Should be conserved, but just in case 3-linear interpolation makes a problem.
    sumz /= (double)TotNumPart; // Never checked whether this makes a difference.

    if (timeStep == nsteps) {

      if (ThisTask == 0) {
        printf("Iteration %d finished\n------------------\n\n", timeStep+1);
        printf("Timestepping finished\n\n");
        fflush(stdout);
      }

      // At final timestep, add back LPT velocities if we had subtracted them.
      // This corresponds to L_+ operator in TZE.
      Dv = DprimeQ(A,1.0);  // dD_{za}/dy
      Dv2 = growthD2v(A);   // dD_{2lpt}/dy

      for(n=0; n<NumPart; n++) {
        P[n].Vel[0] += -sumx+(P[n].Dz[0]*Dv+P[n].D2[0]*Dv2)*subtractLPT;
        P[n].Vel[1] += -sumy+(P[n].Dz[1]*Dv+P[n].D2[1]*Dv2)*subtractLPT;
        P[n].Vel[2] += -sumz+(P[n].Dz[2]*Dv+P[n].D2[2]*Dv2)*subtractLPT;
      }

      // Leaves the loop without drifting
      goto finalize; // Sorry for "goto" :)
    }

    if (ThisTask == 0) {
      printf("Drifting the particles...\n");
      fflush(stdout);
    }

    // Drift
    // =============
    double dyyy;
    double da1,da2;

    AC = AF;   // scale factor the velocities were just kicked to
    AF = AFF;  // scale factor the positions are drifted to

    // StdDA selects the drift-interval formula
    if (StdDA == 0) {
      dyyy=Sq(A,AF,AC);
    } else if (StdDA == 1) {
      dyyy=(AF-A)/Qfactor(AC);
    } else {
      dyyy=SqStd(A,AF);
    }

    da1=growthD(1.0, AF)-growth1;   // change in D
    da2=growthD2(AF)-growth1L2;     // change in D_{2lpt}

    for(n=0; n<NumPart; n++) {
      // Drift with the mean velocity removed
      P[n].Pos[0] += (P[n].Vel[0]-sumx)*dyyy;
      P[n].Pos[1] += (P[n].Vel[1]-sumy)*dyyy;
      P[n].Pos[2] += (P[n].Vel[2]-sumz)*dyyy;

      // Add the LPT displacement change (only when subtractLPT is 1) and wrap
      P[n].Pos[0] = periodic_wrap(P[n].Pos[0]+subtractLPT*(P[n].Dz[0]*da1+P[n].D2[0]*da2));
      P[n].Pos[1] = periodic_wrap(P[n].Pos[1]+subtractLPT*(P[n].Dz[1]*da1+P[n].D2[1]*da2));
      P[n].Pos[2] = periodic_wrap(P[n].Pos[2]+subtractLPT*(P[n].Dz[2]*da1+P[n].D2[2]*da2));
    }

    // Step in time
    // ================
    A = AF;   // WRT to the above name change, A = AFF
    AI = AC;  // WRT to the above name change, AI = AF
    growth1 = growthD(1.0, A);
    growth1L2 = growthD2(A);

    if (ThisTask == 0) {
      printf("Iteration %d finished\n------------------\n\n", timeStep+1);
      fflush(stdout);
    }

    ierr = MPI_Barrier(MPI_COMM_WORLD);
  }

  // Here is the last little bit
  // ===========================
  finalize:

  if (ThisTask == 0) {
    printf("Finishing up\n");
    printf("============\n");
    fflush(stdout);
  }

  // Now convert velocities to v_{rsd}\equiv (ds/d\eta)/(a H(a))
  velRSD(A);

  // Output a slice just for the sake of doing something with P.
  // NOTE(review): the "Converting to RSD velocities..." message is printed after velRSD() has already run.
  if (ThisTask == 0) {
    printf("Converting to RSD velocities...\n");
    printf("Outputting particles...\n");
  }
  slice();
  print_spec();

  fflush(stdout);

  // Release the tables, the particles and the task lookup arrays
  free_powertable();
  free_transfertable();
#ifdef GENERIC_FNL
  free(KernelTable);
#endif
  free(P);
  free(Slab_to_task);
  free(Part_to_task);
  free(Local_nx_table);
  free(Local_np_table);

  // Grids and plans created once before the loop (MEMORY_MODE frees them inside the loop)
#ifndef MEMORY_MODE
  free(density);
  free(N11);
  free(N12);
  free(N13);
#ifdef SINGLE_PRECISION
  fftwf_destroy_plan(plan);
  fftwf_destroy_plan(p11);
  fftwf_destroy_plan(p12);
  fftwf_destroy_plan(p13);
#else
  fftw_destroy_plan(plan);
  fftw_destroy_plan(p11);
  fftw_destroy_plan(p12);
  fftw_destroy_plan(p13);
#endif
#endif

#ifdef SINGLE_PRECISION
  fftwf_mpi_cleanup();
#else
  fftw_mpi_cleanup();
#endif

  if (ThisTask == 0) printf("Done :)\n");

  MPI_Finalize();

  return 0;
}
int cfft2_init(int pad1 /* padding on the first axis */, int nx, int ny /* input data size */, int *nx2, int *ny2 /* padded data size */, int *n_local, int *o_local /* local size & start */) /*< initialize >*/ { int i, nth=1; int cpuid; ptrdiff_t n[2]; MPI_Comm_rank(MPI_COMM_WORLD, &cpuid); fftwf_mpi_init(); nk = n1 = kiss_fft_next_fast_size(nx*pad1); n2 = kiss_fft_next_fast_size(ny); n[0]=n2; n[1]=n1; //alloc_local = fftwf_mpi_local_size_many_transposed(2, n, 2, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, MPI_COMM_WORLD, &local_n0, &local_0_start, &local_n1, &local_1_start); alloc_local = fftwf_mpi_local_size_2d_transposed(n2, n1, MPI_COMM_WORLD, &local_n0, &local_0_start, &local_n1, &local_1_start); cc = sf_complexalloc2(n1,local_n0); //cc = sf_complexalloc(alloc_local); /* kiss-fft */ #ifdef _OPENMP #pragma omp parallel {nth = omp_get_num_threads();} #endif cfg1 = (kiss_fft_cfg *) sf_alloc(nth,sizeof(kiss_fft_cfg)); icfg1 = (kiss_fft_cfg *) sf_alloc(nth,sizeof(kiss_fft_cfg)); cfg2 = (kiss_fft_cfg *) sf_alloc(nth,sizeof(kiss_fft_cfg)); icfg2 = (kiss_fft_cfg *) sf_alloc(nth,sizeof(kiss_fft_cfg)); for (i=0; i < nth; i++) { cfg1[i] = kiss_fft_alloc(n1,0,NULL,NULL); icfg1[i]= kiss_fft_alloc(n1,1,NULL,NULL); cfg2[i] = kiss_fft_alloc(n2,0,NULL,NULL); icfg2[i]= kiss_fft_alloc(n2,1,NULL,NULL); } ctrace2= (kiss_fft_cpx **) sf_complexalloc2(n2,nth); tmp = (kiss_fft_cpx *) sf_alloc(alloc_local,sizeof(kiss_fft_cpx)); tmp2= (sf_complex *) tmp; /* fftw for transpose */ cfg = fftwf_mpi_plan_many_transpose(n2,n1,2, FFTW_MPI_DEFAULT_BLOCK,FFTW_MPI_DEFAULT_BLOCK, (float *) tmp, (float *) tmp, MPI_COMM_WORLD, FFTW_MEASURE); icfg= fftwf_mpi_plan_many_transpose(n1,n2,2, FFTW_MPI_DEFAULT_BLOCK,FFTW_MPI_DEFAULT_BLOCK, (float *) tmp, (float *) tmp, MPI_COMM_WORLD, FFTW_MEASURE); if (NULL == cfg || NULL == icfg) sf_error("FFTW failure."); *nx2 = n1; *ny2 = n2; *n_local = (int) local_n0; *o_local = (int) local_0_start; wt = 1.0/(n1*n2); return (nk*n2); }