Exemplo n.º 1
0
int mcfft3_init(int pad1           /* padding on the first axis */,
	       int nx,   int ny,  int nz   /* input data size */, 
	       int *nx2, int *ny2, int *nz2 /* padded data size */,
               int *n_local, int *o_local /* local size & start */)
/*< initialize >*/
{
  /*
   * Prepares the distributed, in-place 3-D complex FFT on MPI_COMM_WORLD:
   *   - picks FFT-friendly padded sizes for the three axes,
   *   - asks FFTW for this rank's share of the slowest axis (n3),
   *   - allocates the local work array cc and builds the forward (cfg)
   *     and backward (icfg) plans on it,
   *   - stores the normalisation weight wt = 1/(n1*n2*n3).
   *
   * The padded sizes are returned through nx2/ny2/nz2, the local extent
   * and offset along axis 3 through n_local/o_local.  The return value is
   * the product nk*n2*n3 of the padded sizes.
   */

  /* Thread support has to be initialised before fftwf_mpi_init(). */
  if (threads_ok) threads_ok = fftwf_init_threads();

  fftwf_mpi_init();

  if (threads_ok)
    fftwf_plan_with_nthreads(omp_get_max_threads());

  /* axis 1 (the only axis that receives the extra padding factor) */
  nk = n1 = kiss_fft_next_fast_size(nx*pad1);
  /* axis 2 */
  n2 = kiss_fft_next_fast_size(ny);
  /* axis 3 */
  n3 = kiss_fft_next_fast_size(nz);

  /* FFTW takes the dimensions slowest-first, hence (n3, n2, n1). */
  alloc_local = fftwf_mpi_local_size_3d(n3, n2, n1, MPI_COMM_WORLD, &local_n0, &local_0_start);

  /* alloc_local may exceed local_n0*n2*n1; FFTW needs the extra room. */
  cc = sf_complexalloc(alloc_local);

  cfg  = fftwf_mpi_plan_dft_3d(n3,n2,n1,
                               (fftwf_complex *) cc,
                               (fftwf_complex *) cc,
                               MPI_COMM_WORLD,
                               FFTW_FORWARD, FFTW_MEASURE);

  icfg = fftwf_mpi_plan_dft_3d(n3,n2,n1,
                               (fftwf_complex *) cc, 
                               (fftwf_complex *) cc,
                               MPI_COMM_WORLD,
                               FFTW_BACKWARD, FFTW_MEASURE);

  if (NULL == cfg || NULL == icfg) sf_error("FFTW failure.");

  *nx2 = n1;
  *ny2 = n2;
  *nz2 = n3;
  *n_local = (int) local_n0;
  *o_local = (int) local_0_start;

  /* Form the product in floating point: as an integer expression
     n3*n2*n1 overflows for large cubes and corrupts the weight. */
  wt = 1.0/((double) n3 * (double) n2 * (double) n1);

  /* NOTE(review): the return type is int, so this product can still
     overflow for very large grids; callers needing the exact count
     should multiply the sizes returned through nx2/ny2/nz2 themselves. */
  return (nk*n2*n3);
}
Exemplo n.º 2
0
/*
 * Benchmark driver: times one in-place forward 2-D complex DFT of a
 * MATRIX_SIZE x MATRIX_SIZE single-precision matrix distributed over
 * MPI_COMM_WORLD through FFTW's MPI interface.  Rank 0 prints the elapsed
 * time reported by getTime() between the two barriers around the transform.
 *
 * Exits with status 1 (before MPI is started) when the command line is
 * malformed, and aborts the MPI job if FFTW cannot allocate or plan.
 */
int main(int argc, char **argv) {
	fftwf_plan plan;
	fftwf_complex *data;
	ptrdiff_t alloc_local, local_n0, local_0_start, i, j;
	char *end = NULL;
	long requested;

	if (argc != 2) { printf("usage: ./fft_mpi MATRIX_SIZE\n"); exit(1); }

	/* strtol rather than atoi, so that junk and non-positive sizes are
	   rejected instead of silently becoming a 0 x 0 transform. */
	requested = strtol(argv[1], &end, 10);
	if (end == argv[1] || *end != '\0' || requested <= 0) {
		fprintf(stderr, "MATRIX_SIZE must be a positive integer\n");
		exit(1);
	}

	const ptrdiff_t N0 = (ptrdiff_t)requested;
	const ptrdiff_t N1 = N0;
	int id;
	double startTime = 0, totalTime = 0;

	MPI_Init(&argc, &argv);
	MPI_Comm_rank(MPI_COMM_WORLD, &id);
	fftwf_mpi_init();

	/* get local data size and allocate */
	alloc_local = fftwf_mpi_local_size_2d(N0, N1, MPI_COMM_WORLD, &local_n0, &local_0_start);
	data = fftwf_alloc_complex(alloc_local);
	if (data == NULL) {
		fprintf(stderr, "rank %d: unable to allocate the local FFT buffer\n", id);
		MPI_Abort(MPI_COMM_WORLD, 1);
	}

	/* create plan for in-place forward DFT */
	plan = fftwf_mpi_plan_dft_2d(N0, N1, data, data, MPI_COMM_WORLD, FFTW_FORWARD, FFTW_ESTIMATE);
	if (plan == NULL) {
		fprintf(stderr, "rank %d: FFTW could not create the plan\n", id);
		MPI_Abort(MPI_COMM_WORLD, 1);
	}

	/* fill the local rows: real part = first global row owned by this
	   rank, imaginary part = local row index */
	for (i = 0; i < local_n0; ++i) {
		for (j = 0; j < N1; ++j) {
			data[i*N1 + j][0] = local_0_start;
			data[i*N1 + j][1] = i;
		}
	}

	/* time the transform; the barriers make rank 0's clock cover the
	   slowest rank */
	MPI_Barrier(MPI_COMM_WORLD);
	if (id == 0) {
		startTime = getTime();
	}
	fftwf_execute(plan);
	MPI_Barrier(MPI_COMM_WORLD);
	if (id == 0) {
		totalTime += getTime() - startTime;
	}

	fftwf_destroy_plan(plan);
	fftwf_free(data);   /* buffers from fftwf_alloc_* are released with fftwf_free */
	fftwf_mpi_cleanup();

	if (id == 0) {
		printf("%.5f\n", totalTime);
	}

	MPI_Finalize();
	return 0;
}
Exemplo n.º 3
0
int cfft2_init(int pad1           /* padding on the first axis */,
	       int nx,   int ny   /* input data size */, 
	       int *nx2, int *ny2 /* padded data size */,
               int *n_local, int *o_local /* local size & start */,
               MPI_Comm comm)
/*< initialize >*/
{
  /*
   * Prepares the distributed out-of-place 2-D complex FFT on communicator
   * comm:
   *   - picks FFT-friendly padded sizes for both axes,
   *   - asks FFTW for this rank's share of the slow axis (n2),
   *   - allocates the two local work arrays cc and dd and builds the
   *     forward plan cc -> dd (cfg) and the backward plan dd -> cc (icfg),
   *   - stores the normalisation weight wt = 1/(n1*n2).
   *
   * The padded sizes are returned through nx2/ny2, the local extent and
   * offset along axis 2 through n_local/o_local.  The return value is the
   * product nk*n2 of the padded sizes.
   */

  /* Thread support has to be initialised before fftwf_mpi_init(). */
  if (threads_ok) threads_ok = fftwf_init_threads();

  fftwf_mpi_init();

  if (threads_ok)
    fftwf_plan_with_nthreads(omp_get_max_threads());

  /* only the first axis receives the extra padding factor */
  nk = n1 = kiss_fft_next_fast_size(nx*pad1);
  n2 = kiss_fft_next_fast_size(ny);

  /* FFTW takes the dimensions slowest-first, hence (n2, n1). */
  alloc_local = fftwf_mpi_local_size_2d(n2, n1, comm, &local_n0, &local_0_start);

  /* alloc_local may exceed local_n0*n1; FFTW needs the extra room. */
  cc = sf_complexalloc(alloc_local);
  dd = sf_complexalloc(alloc_local);

  cfg = fftwf_mpi_plan_dft_2d(n2,n1,
                              (fftwf_complex *) cc,
                              (fftwf_complex *) dd,
                              comm,
                              FFTW_FORWARD, FFTW_MEASURE);

  icfg = fftwf_mpi_plan_dft_2d(n2,n1,
                               (fftwf_complex *) dd, 
                               (fftwf_complex *) cc,
                               comm,
                               FFTW_BACKWARD, FFTW_MEASURE);

  if (NULL == cfg || NULL == icfg) sf_error("FFTW failure.");

  *nx2 = n1;
  *ny2 = n2;
  *n_local = (int) local_n0;
  *o_local = (int) local_0_start;

  /* Form the product in floating point: as an integer expression n1*n2
     overflows for very large grids and corrupts the weight. */
  wt = 1.0/((double) n1 * (double) n2);

  /* NOTE(review): the return type is int, so this product can still
     overflow for very large grids. */
  return (nk*n2);
}
Exemplo n.º 4
0
// Set up an n_d-dimensional real<->complex FFT descriptor.
//
//   n_d : number of dimensions
//   n   : grid size along each dimension (n_d entries)
//   L   : box length along each dimension (n_d entries)
//   FFT : descriptor to fill in
//
// The routine copies n[] and L[] into FFT, determines the local slab owned by
// this rank, allocates the (in-place) field buffer, creates the forward and
// inverse plans, zeroes the field via clear_field(), tabulates the real-space
// and k-space grids and, for MPI builds, finds the nearest ranks to the left
// and right that actually own part of the grid.
void init_field(int n_d, int *n, double *L, field_info *FFT) {
    ptrdiff_t n_x_local;
    ptrdiff_t i_x_start_local;
    ptrdiff_t n_y_transpose_local;
    ptrdiff_t i_y_start_transpose_local;

    int flag_active;
    int n_active;
    int min_size, max_size;

    SID_log("Initializing ", SID_LOG_OPEN);
    for(ptrdiff_t i_d = 0; i_d < n_d; i_d++) {
        if(i_d < (n_d - 1))
            SID_log("%dx", SID_LOG_CONTINUE, n[i_d]);
        else
            SID_log("%d element %d-d FFT ", SID_LOG_CONTINUE, n[i_d], n_d);
    }
    SID_log("(%d byte precision)...", SID_LOG_CONTINUE, (int)sizeof(GBPREAL));

    // Initialize FFT sizes
    FFT->n_d             = n_d;
    FFT->n               = (ptrdiff_t *)SID_calloc(sizeof(ptrdiff_t) * FFT->n_d);
    FFT->L               = (double *)SID_calloc(sizeof(double) * FFT->n_d);
    FFT->n_k_local       = (ptrdiff_t *)SID_calloc(sizeof(ptrdiff_t) * FFT->n_d);
    FFT->n_R_local       = (ptrdiff_t *)SID_calloc(sizeof(ptrdiff_t) * FFT->n_d);
    FFT->i_R_start_local = (ptrdiff_t *)SID_calloc(sizeof(ptrdiff_t) * FFT->n_d);
    FFT->i_k_start_local = (ptrdiff_t *)SID_calloc(sizeof(ptrdiff_t) * FFT->n_d);
    FFT->i_R_stop_local  = (ptrdiff_t *)SID_calloc(sizeof(ptrdiff_t) * FFT->n_d);
    FFT->i_k_stop_local  = (ptrdiff_t *)SID_calloc(sizeof(ptrdiff_t) * FFT->n_d);
    for(ptrdiff_t i_d = 0; i_d < FFT->n_d; i_d++) {
        FFT->n[i_d]               = n[i_d];
        FFT->L[i_d]               = L[i_d];
        FFT->i_R_start_local[i_d] = 0;
        FFT->i_k_start_local[i_d] = 0;
        FFT->n_R_local[i_d]       = FFT->n[i_d];
        FFT->n_k_local[i_d]       = FFT->n[i_d];
    }
    // The last dimension of the complex array holds only n/2+1 elements
    FFT->n_k_local[FFT->n_d - 1] = FFT->n[FFT->n_d - 1] / 2 + 1;

    // Start the slab description from the undecomposed defaults set above.
    //   The MPI branches below overwrite these with FFTW's decomposition;
    //   the serial branches never assign them, so without this they would
    //   be read uninitialised further down.
    n_x_local                 = FFT->n_R_local[0];
    i_x_start_local           = FFT->i_R_start_local[0];
    n_y_transpose_local       = (FFT->n_d > 1) ? FFT->n_k_local[1] : 0;
    i_y_start_transpose_local = (FFT->n_d > 1) ? FFT->i_k_start_local[1] : 0;

    // Initialize FFTW

    // Create an integer version of FFT->n[] to pass to ..._create_plan
    int *n_int = (int *)SID_malloc(sizeof(int) * FFT->n_d);
    for(int i_d = 0; i_d < FFT->n_d; i_d++)
        n_int[i_d] = (int)FFT->n[i_d];

    // Step 1: local sizes (and, for FFTW2 only, the plans -- FFTW2 plans
    //   are not tied to a buffer, so creating them here is fine).
#if FFTW_V2
#if USE_MPI
    int total_local_size_int;
    int n_x_local_int;
    int i_x_start_local_int;
    int n_y_transpose_local_int;
    int i_y_start_transpose_local_int;
    // NOTE(review): every other call in this function uses SID_COMM_WORLD->comm;
    //   confirm that SID.COMM_WORLD->comm is intended here.
    FFT->plan  = rfftwnd_mpi_create_plan(SID.COMM_WORLD->comm, FFT->n_d, n_int, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE);
    FFT->iplan = rfftwnd_mpi_create_plan(SID.COMM_WORLD->comm, FFT->n_d, n_int, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE);
    rfftwnd_mpi_local_sizes(FFT->plan,
                            &(n_x_local_int),
                            &(i_x_start_local_int),
                            &(n_y_transpose_local_int),
                            &(i_y_start_transpose_local_int),
                            &total_local_size_int);
    n_x_local                 = (ptrdiff_t)n_x_local_int;
    i_x_start_local           = (ptrdiff_t)i_x_start_local_int;
    n_y_transpose_local       = (ptrdiff_t)n_y_transpose_local_int;
    i_y_start_transpose_local = (ptrdiff_t)i_y_start_transpose_local_int;
    FFT->total_local_size     = (size_t)total_local_size_int;
#else
    FFT->total_local_size = 1;
    for(ptrdiff_t i_d = 0; i_d < FFT->n_d; i_d++) {
        if(i_d < FFT->n_d - 1)
            FFT->total_local_size *= FFT->n[i_d];
        else
            FFT->total_local_size *= 2 * (FFT->n[i_d] / 2 + 1);
    }
#if USE_DOUBLE
    FFT->plan  = fftwnd_create_plan(FFT->n_d, n_int, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE);
    FFT->iplan = fftwnd_create_plan(FFT->n_d, n_int, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
#else
    FFT->plan  = rfftwnd_create_plan(FFT->n_d, n_int, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE);
    FFT->iplan = rfftwnd_create_plan(FFT->n_d, n_int, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
#endif
#endif
#else
#if USE_MPI
    // NOTE(review): the local size is requested for the real-space dimensions
    //   FFT->n; confirm against the FFTW MPI r2c documentation that this
    //   leaves enough room for the padded in-place layout.
#if USE_DOUBLE
    fftw_mpi_init();
    FFT->total_local_size = fftw_mpi_local_size_many_transposed(FFT->n_d,
                                                                FFT->n,
                                                                1,
                                                                FFTW_MPI_DEFAULT_BLOCK,
                                                                FFTW_MPI_DEFAULT_BLOCK,
                                                                SID_COMM_WORLD->comm,
                                                                &(n_x_local),
                                                                &(i_x_start_local),
                                                                &(n_y_transpose_local),
                                                                &(i_y_start_transpose_local));
#else
    fftwf_mpi_init();
    FFT->total_local_size = fftwf_mpi_local_size_many_transposed(FFT->n_d,
                                                                 FFT->n,
                                                                 1,
                                                                 FFTW_MPI_DEFAULT_BLOCK,
                                                                 FFTW_MPI_DEFAULT_BLOCK,
                                                                 SID_COMM_WORLD->comm,
                                                                 &(n_x_local),
                                                                 &(i_x_start_local),
                                                                 &(n_y_transpose_local),
                                                                 &(i_y_start_transpose_local));
#endif
#else
    FFT->total_local_size = 1;
    for(ptrdiff_t i_d = 0; i_d < FFT->n_d; i_d++) {
        if(i_d < FFT->n_d - 1)
            FFT->total_local_size *= FFT->n[i_d];
        else
            FFT->total_local_size *= 2 * (FFT->n[i_d] / 2 + 1);
    }
#endif
#endif

    SID_free(SID_FARG n_int);

    // Step 2: allocate the field.  The complex view aliases the real buffer
    //   (in-place transform).  Sizing by sizeof(gbpFFT_real) keeps the buffer
    //   large enough in both single and double precision builds.
#if USE_FFTW3
    FFT->field_local = (gbpFFT_real *)fftwf_malloc(sizeof(gbpFFT_real) * FFT->total_local_size);
#else
    FFT->field_local = (gbpFFT_real *)SID_malloc(sizeof(gbpFFT_real) * FFT->total_local_size);
#endif
    FFT->cfield_local = (gbpFFT_complex *)FFT->field_local;

    // Step 3: FFTW3 plans.  These record the arrays they are given, so they
    //   must be created only after the buffer above exists.
#if !FFTW_V2
#if USE_MPI
#if USE_DOUBLE
    FFT->plan  = fftw_mpi_plan_dft_r2c(FFT->n_d, FFT->n, FFT->field_local, FFT->cfield_local, SID_COMM_WORLD->comm, FFTW_ESTIMATE);
    FFT->iplan = fftw_mpi_plan_dft_c2r(FFT->n_d, FFT->n, FFT->cfield_local, FFT->field_local, SID_COMM_WORLD->comm, FFTW_ESTIMATE);
#else
    FFT->plan  = fftwf_mpi_plan_dft_r2c(FFT->n_d, FFT->n, FFT->field_local, FFT->cfield_local, SID_COMM_WORLD->comm, FFTW_ESTIMATE);
    FFT->iplan = fftwf_mpi_plan_dft_c2r(FFT->n_d, FFT->n, FFT->cfield_local, FFT->field_local, SID_COMM_WORLD->comm, FFTW_ESTIMATE);
#endif
#else
#if USE_DOUBLE
    FFT->plan  = fftw_plan_dft_r2c(FFT->n_d, FFT->n, FFT->field_local, FFT->cfield_local, FFTW_ESTIMATE);
    FFT->iplan = fftw_plan_dft_c2r(FFT->n_d, FFT->n, FFT->cfield_local, FFT->field_local, FFTW_ESTIMATE);
#else
    FFT->plan  = fftwf_plan_dft_r2c(FFT->n_d, FFT->n, FFT->field_local, FFT->cfield_local, FFTW_ESTIMATE);
    FFT->iplan = fftwf_plan_dft_c2r(FFT->n_d, FFT->n, FFT->cfield_local, FFT->field_local, FFTW_ESTIMATE);
#endif
#endif
#endif

    // Set empty slabs to start at 0 to make ignoring them simple.
    if(n_x_local == 0)
        i_x_start_local = 0;
    if(n_y_transpose_local == 0)
        i_y_start_transpose_local = 0;

    // Modify the local slab dimensions according to what FFTW chose.
    FFT->i_R_start_local[0] = i_x_start_local;
    FFT->n_R_local[0]       = n_x_local;
    if(FFT->n_d > 1) {
        FFT->i_k_start_local[1] = i_y_start_transpose_local;
        FFT->n_k_local[1]       = n_y_transpose_local;
    }

    // Upper limits of slab decomposition
    for(ptrdiff_t i_d = 0; i_d < FFT->n_d; i_d++) {
        FFT->i_R_stop_local[i_d] = FFT->i_R_start_local[i_d] + FFT->n_R_local[i_d] - 1;
        FFT->i_k_stop_local[i_d] = FFT->i_k_start_local[i_d] + FFT->n_k_local[i_d] - 1;
    }

    // FFTW padding sizes
    if(FFT->n_d > 1) {
        FFT->pad_size_R = 2 * (FFT->n_R_local[FFT->n_d - 1] / 2 + 1) - FFT->n_R_local[FFT->n_d - 1];
        FFT->pad_size_k = 0;
    } else {
        FFT->pad_size_R = 0;
        FFT->pad_size_k = 0;
    }

    // Number of elements (global and local) in the FFT
    FFT->n_field         = 1;
    FFT->n_field_R_local = 1;
    FFT->n_field_k_local = 1;
    for(ptrdiff_t i_d = 0; i_d < FFT->n_d; i_d++) {
        FFT->n_field *= (size_t)FFT->n[i_d];
        FFT->n_field_R_local *= (size_t)FFT->n_R_local[i_d];
        FFT->n_field_k_local *= (size_t)FFT->n_k_local[i_d];
    }

    // Clear the field
    clear_field(FFT);

    // Initialize the FFT's real-space grid
    //   Each axis gets n+1 entries so that the upper box edge L is tabulated too.
    FFT->R_field = (double **)SID_malloc(sizeof(double *) * FFT->n_d);
    FFT->dR      = (double *)SID_malloc(sizeof(double) * FFT->n_d);
    for(ptrdiff_t i_d = 0; i_d < FFT->n_d; i_d++) {
        FFT->R_field[i_d] = (double *)SID_malloc(sizeof(double) * (FFT->n[i_d] + 1));
        FFT->dR[i_d]      = FFT->L[i_d] / (double)(FFT->n[i_d]);
        for(ptrdiff_t i_i = 0; i_i < FFT->n[i_d]; i_i++)
            FFT->R_field[i_d][i_i] = FFT->L[i_d] * ((double)i_i / (double)(FFT->n[i_d]));
        FFT->R_field[i_d][FFT->n[i_d]] = FFT->L[i_d];
    }

    // Initialize the FFT's k-space grid
    //   Indices at or above n/2 are stored as negative wavenumbers.
    FFT->k_field   = (double **)SID_malloc(sizeof(double *) * FFT->n_d);
    FFT->dk        = (double *)SID_malloc(sizeof(double) * FFT->n_d);
    FFT->k_Nyquist = (double *)SID_malloc(sizeof(double) * FFT->n_d);
    for(ptrdiff_t i_d = 0; i_d < FFT->n_d; i_d++) {
        FFT->k_field[i_d]   = (double *)SID_malloc(sizeof(double) * FFT->n[i_d]);
        FFT->dk[i_d]        = TWO_PI / FFT->L[i_d];
        FFT->k_Nyquist[i_d] = TWO_PI * (double)(FFT->n[i_d]) / FFT->L[i_d] / 2.;
        for(ptrdiff_t i_i = 0; i_i < FFT->n[i_d]; i_i++) {
            if(i_i >= FFT->n[i_d] / 2)
                FFT->k_field[i_d][i_i] = TWO_PI * (double)(i_i - FFT->n[i_d]) / FFT->L[i_d];
            else
                FFT->k_field[i_d][i_i] = TWO_PI * (double)(i_i) / FFT->L[i_d];
        }
    }

    // Flags
    FFT->flag_padded = GBP_FALSE;

    // Slab info
    FFT->slab.n_x_local       = FFT->n_R_local[0];
    FFT->slab.i_x_start_local = FFT->i_R_start_local[0];
    FFT->slab.i_x_stop_local  = FFT->i_R_stop_local[0];
    FFT->slab.x_min_local     = FFT->R_field[0][FFT->i_R_start_local[0]];
    if(FFT->slab.n_x_local > 0)
        FFT->slab.x_max_local = FFT->R_field[0][FFT->i_R_stop_local[0] + 1];
    else
        FFT->slab.x_max_local = FFT->slab.x_min_local;
    SID_Allreduce(&(FFT->slab.x_max_local), &(FFT->slab.x_max), 1, SID_DOUBLE, SID_MAX, SID_COMM_WORLD);

#if USE_MPI
    // All ranks are not necessarily assigned any slices, so
    //   we need to figure out what ranks are to the right and the left for
    //   buffer exchanges.
    // The per-rank slab sizes are held as int so that the buffers match the
    //   SID_INT datatype used in the collectives below.
    int *n_x_rank         = (int *)SID_malloc(sizeof(int) * SID.n_proc);
    n_x_rank[SID.My_rank] = (int)FFT->slab.n_x_local;
    if(n_x_rank[SID.My_rank] > 0)
        flag_active = GBP_TRUE;
    else
        flag_active = GBP_FALSE;
    SID_Allreduce(&flag_active, &n_active, 1, SID_INT, SID_SUM, SID_COMM_WORLD);
    SID_Allreduce(&n_x_rank[SID.My_rank], &min_size, 1, SID_INT, SID_MIN, SID_COMM_WORLD);
    SID_Allreduce(&n_x_rank[SID.My_rank], &max_size, 1, SID_INT, SID_MAX, SID_COMM_WORLD);
    for(int i_rank = 0; i_rank < SID.n_proc; i_rank++)
        SID_Bcast(&(n_x_rank[i_rank]), 1, SID_INT, i_rank, SID_COMM_WORLD);

    // Walk upwards (with wrap-around) to the first rank that owns a slab
    FFT->slab.rank_to_right = -1;
    for(int i_rank = SID.My_rank + 1; i_rank < SID.My_rank + SID.n_proc && FFT->slab.rank_to_right < 0; i_rank++) {
        int j_rank = i_rank % SID.n_proc;
        if(n_x_rank[j_rank] > 0)
            FFT->slab.rank_to_right = j_rank;
    }
    if(FFT->slab.rank_to_right < 0)
        FFT->slab.rank_to_right = SID.My_rank;

    // Walk downwards (with wrap-around) to the first rank that owns a slab
    FFT->slab.rank_to_left = -1;
    for(int i_rank = SID.My_rank - 1; i_rank > SID.My_rank - SID.n_proc && FFT->slab.rank_to_left < 0; i_rank--) {
        int j_rank = i_rank;
        if(i_rank < 0)
            j_rank = i_rank + SID.n_proc;
        if(n_x_rank[j_rank] > 0)
            FFT->slab.rank_to_left = j_rank;
    }
    if(FFT->slab.rank_to_left < 0)
        FFT->slab.rank_to_left = SID.My_rank;

    // Release with the allocator family that produced the buffer
    SID_free(SID_FARG n_x_rank);
    SID_log("(%d cores unused, min/max slab size=%d/%d)...", SID_LOG_CONTINUE, SID.n_proc - n_active, min_size, max_size);
#else
    FFT->slab.rank_to_right = SID.My_rank;
    FFT->slab.rank_to_left  = SID.My_rank;
    if(FFT->slab.n_x_local > 0) {
        flag_active = GBP_TRUE;
        n_active    = 1;
        min_size    = FFT->slab.n_x_local;
        max_size    = FFT->slab.n_x_local;
    } else {
        flag_active = GBP_FALSE;
        n_active    = 0;
        min_size    = 0;
        max_size    = 0;
    }
#endif

    SID_log("Done.", SID_LOG_CLOSE);
}
Exemplo n.º 5
0
int main(int argc, char **argv) {
   
  // Set up MPI
  // ==========
  ierr = MPI_Init(&argc, &argv);
  ierr = MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
  ierr = MPI_Comm_size(MPI_COMM_WORLD, &NTask);
#ifdef SINGLE_PRECISION
  fftwf_mpi_init();
#else
  fftw_mpi_init();
#endif

  if(argc < 2) {
    if(ThisTask == 0) {
      fprintf(stdout, "Input parameters not found\n");
      fprintf(stdout, "Call with <ParameterFile>\n");
    }
    ierr = MPI_Finalize();
    exit(0);
  }
   
  // Read the run parameters and setup code
  // ======================================
  int stepDistr;   
  int subtractLPT;
  double da=0;

  read_parameterfile(argv[1]);
  
  if (UseCOLA == 1){
    subtractLPT = 1; 
    stepDistr   = 0;
    StdDA       = 0;
  } else{
    subtractLPT = 0; 
    stepDistr   = 1;
    StdDA       = 2;
  }
  if (StdDA == 0){
    fullT = 1;
    nLPT  = -2.5;
  }
  filter = 0;              // Whether or not to smooth the forces
  Scale  = 2.*M_PI/Box;    // The force smoothing scale 

  if(ThisTask == 0) {
    printf("Run Parameters\n");
    printf("==============\n");
    printf("Cosmology:\n");
    printf("  Omega Matter(z=0) = %lf\n",Omega);
    printf("  Omega Baryon(z=0) = %lf\n",OmegaBaryon);
    printf("  Hubble Parameter(z=0) = %lf\n",HubbleParam);
    printf("  Sigma8(z=0) = %lf\n",Sigma8);
#ifndef GAUSSIAN
    printf("  F_nl = %lf\n",Fnl);
#endif
    printf("  Primordial Index = %lf\n",PrimordialIndex);
    printf("  Initial Redshift  = %lf\n",Init_Redshift);
    printf("  Final Redshift    = %lf\n",Final_Redshift);
#ifndef GAUSSIAN
    printf("  F_nl Redshift  = %lf\n",Fnl_Redshift);
#endif
    printf("Simulation:\n");
    printf("  Nmesh = %d\n", Nmesh);
    printf("  Nsample = %d\n", Nsample);
    printf("  Boxsize = %lf\n", Box);
    printf("  Buffer Size = %lf\n", Buffer);
    switch(WhichSpectrum) {
      case 0:
        switch (WhichTransfer) {
          case 1:
            printf("  Using Eisenstein & Hu Transfer Function\n");
            break;
          case 2:
            printf("  Using Tabulated Transfer Function\n");
            break;
          default:
            printf("  Using Efstathiou Transfer Function\n");
            break;
        }
        break;
      case 1:
        printf("  Using Eisenstein & Hu Power Spectrum\n");
        break;
      case 2:
        printf("  Using Tabulated Power Spectrum\n");
        break;   
      default:
        printf("  Using Efstathiou Power Spectrum\n");
        break;
    }      
    printf("  Number of Timesteps = %d\n",nsteps);
    if (UseCOLA) {
      printf("  Using COLA method\n\n");
    } else {
      printf("  Using Standard PM method\n\n");
    }
    fflush(stdout);
  }   
  
  // Initial and final scale factors:
  double ai=1.0/(1.0+Init_Redshift);
  double af=1.0/(1.0+Final_Redshift);
    
  if (stepDistr == 0) da=(af-ai)/((double)nsteps);
  if (stepDistr == 1) da=(log(af)-log(ai))/((double)nsteps);
  if (stepDistr == 2) da=(CosmoTime(af)-CosmoTime(ai))/((double)nsteps);

  set_units();

  if (ThisTask == 0) {
    printf("Initialising Transfer Function/Power Spectrum\n");
    printf("=============================================\n");
  }
  initialize_transferfunction();
  initialize_powerspectrum();
  initialize_ffts();
  initialize_parts();

  if(ThisTask == 0) {
    printf("Creating initial conditions\n");
    printf("===========================\n");
    fflush(stdout);
  }

  // Create the calculate the Zeldovich and 2LPT displacements and create the initial conditions
  // ===========================================================================================
  int i, j, k, m;
  unsigned int n, coord;
  double A=ai;                // This is the scale factor which we'll be advancing below.
  double Di=growthD(1.0, A);  // initial growth factor
  double Di2=growthD2(A);     // initial 2nd order growth factor  
  double Dv=DprimeQ(A,1.0);   // T[D_{za}]=dD_{za}/dy
  double Dv2=growthD2v(A);    // T[D_{2lpt}]=dD_{2lpt}/dy

  displacement_fields();
    
  P = (struct part_data *) malloc((int)(ceil(NumPart*Buffer))*sizeof(struct part_data));

  // Generate the initial particle positions and velocities
  // If subtractLPT = 0 (non-COLA), then velocity is ds/dy, which is simply the 2LPT IC.
  // Else set vel = 0 if we subtract LPT. This is the same as the action of the operator L_- from TZE, as initial velocities are in 2LPT.
  for(i=0; i<Local_np; i++) {
    for (j=0; j<Nsample; j++) {
      for (k=0; k<Nsample; k++) {
        coord = (i * Nsample + j) * Nsample + k;
           
        P[coord].ID = ((i + Local_p_start) * Nsample + j) * Nsample + k;
        for (m=0; m<3; m++) {
          P[coord].Dz[m] = ZA[m][coord];
          P[coord].D2[m] = LPT[m][coord];
          if (subtractLPT == 0) {
            P[coord].Vel[m]=P[coord].Dz[m]*Dv+P[coord].D2[m]*Dv2;
          } else {
            P[coord].Vel[m] = 0.0;
          }
        }

        P[coord].Pos[0] = periodic_wrap((i+Local_p_start)*(Box/Nsample)+P[coord].Dz[0]*Di+P[coord].D2[0]*Di2);
        P[coord].Pos[1] = periodic_wrap(j*(Box/Nsample)+P[coord].Dz[1]*Di+P[coord].D2[1]*Di2);
        P[coord].Pos[2] = periodic_wrap(k*(Box/Nsample)+P[coord].Dz[2]*Di+P[coord].D2[2]*Di2);
      }
    }
  }

  for (i=0; i<3; i++) {
    free(ZA[i]);
    free(LPT[i]);
  }

  // Now, we get to the N-Body part where we evolve with time via the Kick-Drift-Kick Method
  // =======================================================================================
  int timeStep;
  double AF=0,AI,AC,AFF=0;
  double growth1   = Di;
  double growth1L2 = Di2;

  // The density grid and force grids  and associated fftw plans
#ifndef MEMORY_MODE
  density = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
  N11  = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
  N12  = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
  N13  = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
  P3D  = (complex_kind*)density;
  FN11 = (complex_kind*)N11;
  FN12 = (complex_kind*)N12;
  FN13 = (complex_kind*)N13;
#ifdef SINGLE_PRECISION
  plan = fftwf_mpi_plan_dft_r2c_3d(Nmesh,Nmesh,Nmesh,density,P3D,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p11  = fftwf_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN11,N11,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p12  = fftwf_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN12,N12,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p13  = fftwf_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN13,N13,MPI_COMM_WORLD,FFTW_ESTIMATE);
#else
  plan = fftw_mpi_plan_dft_r2c_3d(Nmesh,Nmesh,Nmesh,density,P3D,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p11  = fftw_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN11,N11,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p12  = fftw_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN12,N12,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p13  = fftw_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN13,N13,MPI_COMM_WORLD,FFTW_ESTIMATE);
#endif
#endif

 if(ThisTask == 0) {
    printf("Beginning timestepping\n");
    printf("======================\n");
    fflush(stdout);
  }
  
  // AI stores the scale factor to which the velocities have been kicked to. Initially it's just A.
  AI=A;
  for (timeStep=0;timeStep<=nsteps;timeStep++){
    
    // AFF is the scale factor to which we should drift the particle positions.
    // AF is the scale factor to which we should kick the particle velocities.
    if (stepDistr == 0) AFF=A+da;
    if (stepDistr == 1) AFF=A*exp(da);
    if (stepDistr == 2) AFF=AofTime(CosmoTime(A)+da);

    // half time-step for final kick
    if (timeStep == nsteps) {
      AF=A; 
    } else { 
      // Set to mid-point of interval. In the infinitesimal timestep limit, these choices are identical. 
      // How one chooses the mid-point when not in that limit is really an extra degree of freedom in the code 
      // but Tassev et al. report negligible effects from the different choices below. 
      // Hence, this is not exported as an extra switch at this point.
      if (stepDistr == 0) AF=A+da*0.5;
      if (stepDistr == 1) AF=A*exp(da*0.5);
      if (stepDistr == 2) AF=AofTime((CosmoTime(AFF)+CosmoTime(A))*0.5); 
    }
    
    if (ThisTask == 0) {
      printf("Iteration = %d\n------------------\n",timeStep+1);
      printf("a = %lf\n",A);
      printf("z = %lf\n",1.0/A-1.0);
      fflush(stdout);
    }

    // First we check whether all the particles are on the correct processor after the last time step/
    // original 2LPT displacement and move them if not
    if (ThisTask == 0) printf("Moving particles across task boundaries...\n");
    MoveParticles();

#ifdef MEMORY_MODE
    density = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
    P3D  = (complex_kind*)density;
#ifdef SINGLE_PRECISION
    plan = fftwf_mpi_plan_dft_r2c_3d(Nmesh,Nmesh,Nmesh,density,P3D,MPI_COMM_WORLD,FFTW_ESTIMATE);
#else
    plan = fftw_mpi_plan_dft_r2c_3d(Nmesh,Nmesh,Nmesh,density,P3D,MPI_COMM_WORLD,FFTW_ESTIMATE);
#endif
#endif

    // Then we do the Cloud-in-Cell assignment to get the density grid and FFT it.  
    if (ThisTask == 0) printf("Calculating density using Cloud-in-Cell...\n");
    PtoMesh();

#ifdef MEMORY_MODE
    N11  = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
    N12  = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
    N13  = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
    FN11 = (complex_kind*)N11;
    FN12 = (complex_kind*)N12;
    FN13 = (complex_kind*)N13;
#ifdef SINGLE_PRECISION
    p11  = fftwf_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN11,N11,MPI_COMM_WORLD,FFTW_ESTIMATE);
    p12  = fftwf_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN12,N12,MPI_COMM_WORLD,FFTW_ESTIMATE);
    p13  = fftwf_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN13,N13,MPI_COMM_WORLD,FFTW_ESTIMATE);
#else
    p11  = fftw_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN11,N11,MPI_COMM_WORLD,FFTW_ESTIMATE);
    p12  = fftw_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN12,N12,MPI_COMM_WORLD,FFTW_ESTIMATE);
    p13  = fftw_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN13,N13,MPI_COMM_WORLD,FFTW_ESTIMATE);
#endif
#endif
    
    // This returns N11,N12,N13 which hold the components of
    // the vector (grad grad^{-2} density) on a grid.
    if (ThisTask == 0) printf("Calculating forces...\n");
    Forces(); 

#ifdef MEMORY_MODE
    free(density);
    for (i=0; i<3; i++) Disp[i] = (float *)malloc(NumPart*sizeof(float));
#ifdef SINGLE_PRECISION
    fftwf_destroy_plan(plan);
#else 
    fftw_destroy_plan(plan);
#endif
#else
    for (i=0; i<3; i++) Disp[i] = (float_kind *)malloc(NumPart*sizeof(float_kind));
#endif
    
    // Now find the accelerations at the particle positions using 3-linear interpolation. 
    if (ThisTask == 0) printf("Calculating accelerations...\n");
    MtoParticles();

#ifdef MEMORY_MODE
  free(N11);
  free(N12);
  free(N13);  
#ifdef SINGLE_PRECISION
  fftwf_destroy_plan(p11);
  fftwf_destroy_plan(p12);
  fftwf_destroy_plan(p13);
#else
  fftw_destroy_plan(p11);
  fftw_destroy_plan(p12);
  fftw_destroy_plan(p13);
#endif
#endif
    
    // Calculate the mean displacement and subtract later.
    if (ThisTask == 0) printf("Calculating mean of displacements...\n");
    double sumDx=0,sumDy=0,sumDz=0;
    for(n=0; n<NumPart; n++) {
      sumDx += Disp[0][n];
      sumDy += Disp[1][n];
      sumDz += Disp[2][n];
    }

    // Make sumDx, sumDy and sumDz global averages
    ierr = MPI_Allreduce(MPI_IN_PLACE,&sumDx,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
    ierr = MPI_Allreduce(MPI_IN_PLACE,&sumDy,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
    ierr = MPI_Allreduce(MPI_IN_PLACE,&sumDz,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);  
    
    sumDx /= (double)TotNumPart; // We will subtract these below to conserve momentum. 
    sumDy /= (double)TotNumPart;
    sumDz /= (double)TotNumPart; 

    if (ThisTask == 0) {
      printf("Kicking the particles...\n");
      fflush(stdout);
    }

    // Kick
    // ===============
    double dda;
    double q1,q2;
    double ax,ay,az;
    double sumx=0,sumy=0,sumz=0; 
    double Om143=pow(Omega/(Omega+(1-Omega)*A*A*A),1./143.);
    
    if (StdDA == 0) {
      dda=Sphi(AI,AF,A);
    } else if (StdDA == 1) {
      dda=(AF-AI)*A/Qfactor(A);
    } else {
      dda=SphiStd(AI,AF);
    }  
    
    q2=1.5*Omega*growth1*growth1*(1.0+7./3.*Om143)*A; // T^2[D_{2lpt}]=d^2 D_{2lpt}/dy^2
    q1=1.5*Omega*growth1*A;                           // T^2[D_{ZA}]=d^2 D_{ZA}/dy^2
    
    for(n=0; n<NumPart; n++) {

      Disp[0][n] -= sumDx;
      Disp[1][n] -= sumDy;
      Disp[2][n] -= sumDz;

      ax=-1.5*Omega*Disp[0][n]-subtractLPT*(P[n].Dz[0]*q1+P[n].D2[0]*q2)/A;
      ay=-1.5*Omega*Disp[1][n]-subtractLPT*(P[n].Dz[1]*q1+P[n].D2[1]*q2)/A;
      az=-1.5*Omega*Disp[2][n]-subtractLPT*(P[n].Dz[2]*q1+P[n].D2[2]*q2)/A;

      P[n].Vel[0] += ax*dda;
      P[n].Vel[1] += ay*dda;
      P[n].Vel[2] += az*dda;

      sumx += P[n].Vel[0];
      sumy += P[n].Vel[1];
      sumz += P[n].Vel[2];
    }

    for (i=0; i<3; i++) free(Disp[i]);

    // Make sumx, sumy and sumz global averages
    ierr = MPI_Allreduce(MPI_IN_PLACE,&sumx,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
    ierr = MPI_Allreduce(MPI_IN_PLACE,&sumy,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
    ierr = MPI_Allreduce(MPI_IN_PLACE,&sumz,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);  
    
    sumx /= (double)TotNumPart;  // We will subtract these below to conserve momentum. 
    sumy /= (double)TotNumPart;  // Should be conserved, but just in case 3-linear interpolation makes a problem.
    sumz /= (double)TotNumPart;  // Never checked whether this makes a difference.

    if (timeStep == nsteps) {

      if (ThisTask == 0) {
        printf("Iteration %d finished\n------------------\n\n", timeStep+1);
        printf("Timestepping finished\n\n");
        fflush(stdout);
      }
    
      // At final timestep, add back LPT velocities if we had subtracted them. 
      // This corresponds to L_+ operator in TZE.
      Dv  = DprimeQ(A,1.0);  // dD_{za}/dy
      Dv2 = growthD2v(A);    // dD_{2lpt}/dy

      for(n=0; n<NumPart; n++) {
        P[n].Vel[0] += -sumx+(P[n].Dz[0]*Dv+P[n].D2[0]*Dv2)*subtractLPT;
        P[n].Vel[1] += -sumy+(P[n].Dz[1]*Dv+P[n].D2[1]*Dv2)*subtractLPT;
        P[n].Vel[2] += -sumz+(P[n].Dz[2]*Dv+P[n].D2[2]*Dv2)*subtractLPT;
      }

      goto finalize; // Sorry for "goto" :)
    }
    
    if (ThisTask == 0) {
      printf("Drifting the particles...\n");
      fflush(stdout);
    }

    // Drift
    // =============
    double dyyy;
    double da1,da2;

    AC = AF;
    AF = AFF;
    
    if (StdDA == 0) {
      dyyy=Sq(A,AF,AC);
    } else if (StdDA == 1) {
      dyyy=(AF-A)/Qfactor(AC);
    } else {
      dyyy=SqStd(A,AF);
    }

    da1=growthD(1.0, AF)-growth1;    // change in D
    da2=growthD2(AF)-growth1L2; // change in D_{2lpt}
    
    for(n=0; n<NumPart; n++) {
        P[n].Pos[0] += (P[n].Vel[0]-sumx)*dyyy;
        P[n].Pos[1] += (P[n].Vel[1]-sumy)*dyyy;
        P[n].Pos[2] += (P[n].Vel[2]-sumz)*dyyy;

        P[n].Pos[0] = periodic_wrap(P[n].Pos[0]+subtractLPT*(P[n].Dz[0]*da1+P[n].D2[0]*da2));
        P[n].Pos[1] = periodic_wrap(P[n].Pos[1]+subtractLPT*(P[n].Dz[1]*da1+P[n].D2[1]*da2));
        P[n].Pos[2] = periodic_wrap(P[n].Pos[2]+subtractLPT*(P[n].Dz[2]*da1+P[n].D2[2]*da2));
    }

    // Step in time
    // ================
    A  = AF;   // WRT to the above name change, A  = AFF
    AI = AC;   // WRT to the above name change, AI = AF

    growth1   = growthD(1.0, A);
    growth1L2 = growthD2(A);

    if (ThisTask == 0) {
      printf("Iteration %d finished\n------------------\n\n", timeStep+1);
      fflush(stdout);
    }
     
    ierr = MPI_Barrier(MPI_COMM_WORLD);

  }

  // Here is the last little bit
  // ===========================
  finalize:

  if (ThisTask == 0) {
    printf("Finishing up\n");
    printf("============\n");
    fflush(stdout);
  }
    
  // Now convert velocities to v_{rsd}\equiv (ds/d\eta)/(a H(a))
  velRSD(A);
    
  // Output a slice just for the sake of doing something with P.
  if (ThisTask == 0) {
    printf("Converting to RSD velocities...\n");
    printf("Outputting particles...\n"); 
  }
  slice();
  print_spec();
  fflush(stdout);

  free_powertable();
  free_transfertable();
#ifdef GENERIC_FNL
  free(KernelTable);
#endif

  free(P);
  free(Slab_to_task);
  free(Part_to_task);
  free(Local_nx_table);
  free(Local_np_table);
#ifndef MEMORY_MODE
  free(density);
  free(N11);
  free(N12);
  free(N13);  
#ifdef SINGLE_PRECISION
  fftwf_destroy_plan(plan);
  fftwf_destroy_plan(p11);
  fftwf_destroy_plan(p12);
  fftwf_destroy_plan(p13);
#else
  fftw_destroy_plan(plan);
  fftw_destroy_plan(p11);
  fftw_destroy_plan(p12);
  fftw_destroy_plan(p13);
#endif
#endif

#ifdef SINGLE_PRECISION
  fftwf_mpi_cleanup();
#else
  fftw_mpi_cleanup();
#endif

  if (ThisTask == 0) printf("Done :)\n");

  MPI_Finalize();   

  return 0;
}
Exemplo n.º 6
0
int cfft2_init(int pad1           /* padding on the first axis */,
	       int nx,   int ny   /* input data size */, 
	       int *nx2, int *ny2 /* padded data size */,
               int *n_local, int *o_local /* local size & start */)
/*< initialize >*/
{
  int i, nth=1;
  int cpuid;
  ptrdiff_t n[2];
  
  MPI_Comm_rank(MPI_COMM_WORLD, &cpuid);

  fftwf_mpi_init();

  nk = n1 = kiss_fft_next_fast_size(nx*pad1);
  n2 = kiss_fft_next_fast_size(ny);

  n[0]=n2; n[1]=n1;
  //alloc_local = fftwf_mpi_local_size_many_transposed(2, n, 2, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, MPI_COMM_WORLD, &local_n0, &local_0_start, &local_n1, &local_1_start);
  alloc_local = fftwf_mpi_local_size_2d_transposed(n2, n1, MPI_COMM_WORLD, &local_n0, &local_0_start, &local_n1, &local_1_start);

  cc = sf_complexalloc2(n1,local_n0);
  //cc = sf_complexalloc(alloc_local);

  /* kiss-fft */

#ifdef _OPENMP
#pragma omp parallel
  {nth = omp_get_num_threads();}
#endif

  cfg1  = (kiss_fft_cfg *) sf_alloc(nth,sizeof(kiss_fft_cfg));
  icfg1 = (kiss_fft_cfg *) sf_alloc(nth,sizeof(kiss_fft_cfg));
  cfg2  = (kiss_fft_cfg *) sf_alloc(nth,sizeof(kiss_fft_cfg));
  icfg2 = (kiss_fft_cfg *) sf_alloc(nth,sizeof(kiss_fft_cfg));

  for (i=0; i < nth; i++) {
    cfg1[i] = kiss_fft_alloc(n1,0,NULL,NULL);
    icfg1[i]= kiss_fft_alloc(n1,1,NULL,NULL);
    cfg2[i] = kiss_fft_alloc(n2,0,NULL,NULL);
    icfg2[i]= kiss_fft_alloc(n2,1,NULL,NULL);
  }

  ctrace2= (kiss_fft_cpx **) sf_complexalloc2(n2,nth);

  tmp =    (kiss_fft_cpx *) sf_alloc(alloc_local,sizeof(kiss_fft_cpx));
  tmp2= (sf_complex *) tmp;

  /* fftw for transpose */

  cfg = fftwf_mpi_plan_many_transpose(n2,n1,2,
                              FFTW_MPI_DEFAULT_BLOCK,FFTW_MPI_DEFAULT_BLOCK,
                              (float *) tmp,
                              (float *) tmp,
                              MPI_COMM_WORLD,
                              FFTW_MEASURE);

  icfg= fftwf_mpi_plan_many_transpose(n1,n2,2,
                              FFTW_MPI_DEFAULT_BLOCK,FFTW_MPI_DEFAULT_BLOCK,
                              (float *) tmp,
                              (float *) tmp,
                              MPI_COMM_WORLD,
                              FFTW_MEASURE);

  if (NULL == cfg || NULL == icfg) sf_error("FFTW failure.");

  *nx2 = n1;
  *ny2 = n2;
  *n_local = (int) local_n0;
  *o_local = (int) local_0_start;
	
  wt =  1.0/(n1*n2);
	
  return (nk*n2);
}