/*
 * Two-dimensional front end for rfftwnd_create_plan_specific().
 *
 * Packs nx and ny (in that order) into a rank-2 dimension array and
 * forwards dir, flags, the in/out arrays and their strides unchanged.
 * Returns whatever the N-dimensional planner returns.
 */
fftwnd_plan rfftw2d_create_plan_specific(int nx, int ny, fftw_direction dir, int flags,
                                         fftw_real *in, int istride,
                                         fftw_real *out, int ostride)
{
     int dims[2];

     dims[0] = nx;
     dims[1] = ny;

     return rfftwnd_create_plan_specific(2, dims, dir, flags,
                                         in, istride, out, ostride);
}
/*
 * Create a real N-dimensional plan without supplying sample arrays.
 *
 * Delegates to rfftwnd_create_plan_specific() with null input and output
 * pointers and a stride of 1 for each.
 */
fftwnd_plan rfftwnd_create_plan(int rank, const int *n, fftw_direction dir, int flags)
{
     fftwnd_plan plan;

     plan = rfftwnd_create_plan_specific(rank, n, dir, flags,
                                         0, 1,   /* in,  istride */
                                         0, 1);  /* out, ostride */
     return plan;
}
/*
 * Test in-place real<->complex transforms over the dimensions n[0..rank-1].
 *
 * For every stride from 1 to MAX_STRIDE the routine fills a real buffer
 * with random data, runs the real-to-complex plan in place and compares
 * the result with the output of validated_plan applied to the same data
 * stored as complex numbers.  It then runs the complex-to-real plan in
 * place, scales by 1/N and compares against the original input.
 *
 * alternate_api and specific select which plan-creation entry point is
 * exercised (rank-2/rank-3 convenience wrappers and/or the "_specific"
 * planners).  Failures are reported through CHECK.
 */
void testnd_in_place(int rank, int *n, fftwnd_plan validated_plan,
                     int alternate_api, int specific)
{
     int istride, ostride, howmany;
     int total, axis, row, col, rep, idx;
     int rows, half_len, last_len;
     int use_fixed_rank_api;
     fftw_real *real_buf, *real_view;
     fftw_complex *ref_in, *cplx_view, *ref_out;
     fftwnd_plan fwd, inv;
     int flags = measure_flag | wisdom_flag | FFTW_IN_PLACE;

     if (coinflip())
          flags |= FFTW_THREADSAFE;

     /* total = product of all dimensions; last_len = last dimension;
        rows = product of the remaining ones; half_len = last_len/2 + 1. */
     total = rows = last_len = half_len = 1;
     for (axis = 0; axis < rank; ++axis)
          total *= n[axis];
     if (rank > 0) {
          last_len = n[rank - 1];
          rows = total / last_len;
          half_len = last_len / 2 + 1;
     }

     /* One buffer serves as real input, complex output and real output. */
     real_buf = (fftw_real *) fftw_malloc(2 * half_len * rows * MAX_STRIDE
                                          * sizeof(fftw_real));
     real_view = real_buf;
     cplx_view = (fftw_complex *) real_buf;
     ref_in = (fftw_complex *) fftw_malloc(total * sizeof(fftw_complex));
     ref_out = (fftw_complex *) fftw_malloc(total * sizeof(fftw_complex));

     /* The rank-specific wrappers are only used for rank 2 and rank 3. */
     use_fixed_rank_api = alternate_api && (rank == 2 || rank == 3);

     if (specific) {
          if (use_fixed_rank_api) {
               if (rank == 2) {
                    fwd = rfftw2d_create_plan_specific(n[0], n[1],
                                                       FFTW_REAL_TO_COMPLEX, flags,
                                                       real_buf, MAX_STRIDE, 0, 0);
                    inv = rfftw2d_create_plan_specific(n[0], n[1],
                                                       FFTW_COMPLEX_TO_REAL, flags,
                                                       real_buf, MAX_STRIDE, 0, 0);
               } else {
                    fwd = rfftw3d_create_plan_specific(n[0], n[1], n[2],
                                                       FFTW_REAL_TO_COMPLEX, flags,
                                                       real_buf, MAX_STRIDE, 0, 0);
                    inv = rfftw3d_create_plan_specific(n[0], n[1], n[2],
                                                       FFTW_COMPLEX_TO_REAL, flags,
                                                       real_buf, MAX_STRIDE, 0, 0);
               }
          } else {
               fwd = rfftwnd_create_plan_specific(rank, n, FFTW_REAL_TO_COMPLEX, flags,
                                                  real_buf, MAX_STRIDE,
                                                  real_buf, MAX_STRIDE);
               inv = rfftwnd_create_plan_specific(rank, n, FFTW_COMPLEX_TO_REAL, flags,
                                                  real_buf, MAX_STRIDE,
                                                  real_buf, MAX_STRIDE);
          }
     } else {
          if (use_fixed_rank_api) {
               if (rank == 2) {
                    fwd = rfftw2d_create_plan(n[0], n[1], FFTW_REAL_TO_COMPLEX, flags);
                    inv = rfftw2d_create_plan(n[0], n[1], FFTW_COMPLEX_TO_REAL, flags);
               } else {
                    fwd = rfftw3d_create_plan(n[0], n[1], n[2],
                                              FFTW_REAL_TO_COMPLEX, flags);
                    inv = rfftw3d_create_plan(n[0], n[1], n[2],
                                              FFTW_COMPLEX_TO_REAL, flags);
               }
          } else {
               fwd = rfftwnd_create_plan(rank, n, FFTW_REAL_TO_COMPLEX, flags);
               inv = rfftwnd_create_plan(rank, n, FFTW_COMPLEX_TO_REAL, flags);
          }
     }
     CHECK(fwd != NULL && inv != NULL, "can't create plan");

     /* Clear the whole shared buffer, padding included. */
     for (idx = 0; idx < rows * half_len * 2 * MAX_STRIDE; ++idx)
          real_view[idx] = 0;

     for (istride = 1; istride <= MAX_STRIDE; ++istride) {
          /* generate random inputs: each sample goes into the complex
             reference array and into istride interleaved real copies,
             laid out with a padded row length of 2 * half_len */
          for (row = 0; row < rows; ++row) {
               for (col = 0; col < last_len; ++col) {
                    idx = row * last_len + col;
                    c_re(ref_in[idx]) = DRAND();
                    c_im(ref_in[idx]) = 0.0;
                    for (rep = 0; rep < istride; ++rep)
                         real_buf[(row * half_len * 2 + col) * istride + rep]
                              = c_re(ref_in[idx]);
               }
          }

          /* Reference result from the validated complex plan. */
          fftwnd(validated_plan, 1, ref_in, 1, 1, ref_out, 1, 1);

          howmany = ostride = istride;

          WHEN_VERBOSE(2, printf("\n testing in-place stride %d...", istride));

          /* The single-transform entry point is only eligible for unit
             stride/howmany, and then only on a coin flip. */
          if (howmany == 1 && istride == 1 && ostride == 1 && !coinflip())
               rfftwnd_one_real_to_complex(fwd, real_buf, NULL);
          else
               rfftwnd_real_to_complex(fwd, howmany, real_buf, istride, 1,
                                       cplx_view, ostride, 1);

          for (row = 0; row < rows; ++row) {
               for (rep = 0; rep < howmany; ++rep) {
                    CHECK(compute_error_complex(cplx_view + row * half_len * ostride + rep,
                                                ostride,
                                                ref_out + row * last_len, 1,
                                                half_len) < TOLERANCE,
                          "in-place (r2c): wrong answer");
               }
          }

          if (howmany == 1 && istride == 1 && ostride == 1 && !coinflip())
               rfftwnd_one_complex_to_real(inv, cplx_view, NULL);
          else
               rfftwnd_complex_to_real(inv, howmany, cplx_view, ostride, 1,
                                       real_view, istride, 1);

          /* Scale the round-trip result by 1/N before comparing. */
          for (idx = 0; idx < rows * half_len * 2 * istride; ++idx)
               real_view[idx] *= 1.0 / total;

          /* Compare against the real parts of the reference input
             (hence the stride of 2 through the complex array). */
          for (row = 0; row < rows; ++row) {
               for (rep = 0; rep < howmany; ++rep) {
                    CHECK(compute_error(real_view + row * half_len * 2 * istride + rep,
                                        istride,
                                        (fftw_real *) (ref_in + row * last_len), 2,
                                        last_len) < TOLERANCE,
                          "in-place (c2r): wrong answer (check 2)");
               }
          }
     }

     rfftwnd_destroy_plan(fwd);
     rfftwnd_destroy_plan(inv);

     fftw_free(ref_out);
     fftw_free(ref_in);
     fftw_free(real_buf);
}
/*
 * Time planner creation and one in-place real multi-dimensional transform
 * of the size described by sz, in direction dir.
 *
 * FFTW_IN_PLACE is always added to flags.  When specific is nonzero the
 * plan is built with rfftwnd_create_plan_specific() on the benchmark
 * buffer; otherwise rfftwnd_create_plan() is used.  Results are printed
 * through WHEN_VERBOSE.
 */
void test_speed_nd_aux(struct size sz, fftw_direction dir, int flags, int specific)
{
     fftw_real *in;
     fftwnd_plan plan;
     double t;
     fftw_time begin, end;
     int dim, N;
     int planner_flags;

     /* only bench in-place multi-dim transforms */
     flags |= FFTW_IN_PLACE;
     planner_flags = speed_flag | flags | wisdom_flag | no_vector_flag;

     /* Element count: product of the leading dimensions times the last
        dimension plus 2 (dim is left pointing at the last dimension). */
     N = 1;
     for (dim = 0; dim < sz.rank - 1; ++dim)
          N *= sz.narray[dim];
     N *= (sz.narray[dim] + 2);

     in = (fftw_real *) fftw_malloc(N * howmany_fields * sizeof(fftw_real));

     begin = fftw_get_time();
     if (specific)
          plan = rfftwnd_create_plan_specific(sz.rank, sz.narray, dir,
                                              planner_flags,
                                              in, howmany_fields, 0, 1);
     else
          plan = rfftwnd_create_plan(sz.rank, sz.narray, dir, planner_flags);
     end = fftw_get_time();

     CHECK(plan != NULL, "can't create plan");

     t = fftw_time_to_sec(fftw_time_diff(end, begin));
     WHEN_VERBOSE(2, printf("time for planner: %f s\n", t));

     WHEN_VERBOSE(2, printf("\n"));
     WHEN_VERBOSE(2, (rfftwnd_print_plan(plan)));
     WHEN_VERBOSE(2, printf("\n"));

     /* FFTW_TIME_FFT stores the measured time in t. */
     if (dir == FFTW_REAL_TO_COMPLEX) {
          FFTW_TIME_FFT(rfftwnd_real_to_complex(plan, howmany_fields,
                                                in, howmany_fields, 1,
                                                0, 0, 0),
                        in, N * howmany_fields, t);
     } else {
          FFTW_TIME_FFT(rfftwnd_complex_to_real(plan, howmany_fields,
                                                (fftw_complex *) in,
                                                howmany_fields, 1,
                                                0, 0, 0),
                        in, N * howmany_fields, t);
     }

     rfftwnd_destroy_plan(plan);

     WHEN_VERBOSE(1, printf("time for one fft: %s", smart_sprint_time(t)));
     WHEN_VERBOSE(1, printf(" (%s/point)\n", smart_sprint_time(t / N)));
     WHEN_VERBOSE(1, printf("\"mflops\" = 5/2 (N log2 N) / (t in microseconds)"
                            " = %f\n", 0.5 * howmany_fields * mflops(t, N)));

     fftw_free(in);

     WHEN_VERBOSE(1, printf("\n"));
}
maxwell_data *create_maxwell_data(int nx, int ny, int nz, int *local_N, int *N_start, int *alloc_N, int num_bands, int max_fft_bands) { int n[3], rank = (nz == 1) ? (ny == 1 ? 1 : 2) : 3; maxwell_data *d = 0; int fft_data_size; n[0] = nx; n[1] = ny; n[2] = nz; #if !defined(HAVE_FFTW) && !defined(HAVE_FFTW3) # error Non-FFTW FFTs are not currently supported. #endif #if defined(HAVE_FFTW) CHECK(sizeof(fftw_real) == sizeof(real), "floating-point type is inconsistent with FFTW!"); #endif CHK_MALLOC(d, maxwell_data, 1); d->nx = nx; d->ny = ny; d->nz = nz; d->max_fft_bands = MIN2(num_bands, max_fft_bands); maxwell_set_num_bands(d, num_bands); d->current_k[0] = d->current_k[1] = d->current_k[2] = 0.0; d->parity = NO_PARITY; d->last_dim_size = d->last_dim = n[rank - 1]; /* ----------------------------------------------------- */ d->nplans = 1; #ifndef HAVE_MPI d->local_nx = nx; d->local_ny = ny; d->local_x_start = d->local_y_start = 0; *local_N = *alloc_N = nx * ny * nz; *N_start = 0; d->other_dims = *local_N / d->last_dim; d->fft_data = 0; /* initialize it here for use in specific planner? 
*/ # if defined(HAVE_FFTW3) d->nplans = 0; /* plans will be created as needed */ # ifdef SCALAR_COMPLEX d->fft_output_size = fft_data_size = nx * ny * nz; # else d->last_dim_size = 2 * (d->last_dim / 2 + 1); d->fft_output_size = (fft_data_size = d->other_dims * d->last_dim_size)/2; # endif # elif defined(HAVE_FFTW) # ifdef SCALAR_COMPLEX d->fft_output_size = fft_data_size = nx * ny * nz; d->plans[0] = fftwnd_create_plan_specific(rank, n, FFTW_BACKWARD, FFTW_ESTIMATE | FFTW_IN_PLACE, (fftw_complex*) d->fft_data, 3 * d->num_fft_bands, (fftw_complex*) d->fft_data, 3 * d->num_fft_bands); d->iplans[0] = fftwnd_create_plan_specific(rank, n, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_IN_PLACE, (fftw_complex*) d->fft_data, 3 * d->num_fft_bands, (fftw_complex*) d->fft_data, 3 * d->num_fft_bands); # else /* not SCALAR_COMPLEX */ d->last_dim_size = 2 * (d->last_dim / 2 + 1); d->fft_output_size = (fft_data_size = d->other_dims * d->last_dim_size)/2; d->plans[0] = rfftwnd_create_plan_specific(rank, n, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE, (fftw_real*) d->fft_data, 3 * d->num_fft_bands, (fftw_real*) d->fft_data, 3 * d->num_fft_bands); d->iplans[0] = rfftwnd_create_plan_specific(rank, n, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE, (fftw_real*) d->fft_data, 3 * d->num_fft_bands, (fftw_real*) d->fft_data, 3 * d->num_fft_bands); # endif /* not SCALAR_COMPLEX */ # endif /* HAVE_FFTW */ #else /* HAVE_MPI */ /* ----------------------------------------------------- */ # if defined(HAVE_FFTW3) { int i; ptrdiff_t np[3], local_nx, local_ny, local_x_start, local_y_start; CHECK(rank > 1, "rank < 2 MPI computations are not supported"); d->nplans = 0; /* plans will be created as needed */ for (i = 0; i < rank; ++i) np[i] = n[i]; # ifndef SCALAR_COMPLEX d->last_dim_size = 2 * (np[rank-1] = d->last_dim / 2 + 1); # endif fft_data_size = *alloc_N = FFTW(mpi_local_size_transposed)(rank, np, MPI_COMM_WORLD, &local_nx, &local_x_start, &local_ny, &local_y_start); # ifndef 
SCALAR_COMPLEX fft_data_size = (*alloc_N *= 2); // convert to # of real scalars # endif d->local_nx = local_nx; d->local_x_start = local_x_start; d->local_ny = local_ny; d->local_y_start = local_y_start; d->fft_output_size = nx * d->local_ny * (rank==3 ? np[2] : nz); *local_N = d->local_nx * ny * nz; *N_start = d->local_x_start * ny * nz; d->other_dims = *local_N / d->last_dim; } # elif defined(HAVE_FFTW) CHECK(rank > 1, "rank < 2 MPI computations are not supported"); # ifdef SCALAR_COMPLEX d->iplans[0] = fftwnd_mpi_create_plan(MPI_COMM_WORLD, rank, n, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_IN_PLACE); { int nt[3]; /* transposed dimensions for reverse FFT */ nt[0] = n[1]; nt[1] = n[0]; nt[2] = n[2]; d->plans[0] = fftwnd_mpi_create_plan(MPI_COMM_WORLD, rank, nt, FFTW_BACKWARD, FFTW_ESTIMATE | FFTW_IN_PLACE); } fftwnd_mpi_local_sizes(d->iplans[0], &d->local_nx, &d->local_x_start, &d->local_ny, &d->local_y_start, &fft_data_size); d->fft_output_size = nx * d->local_ny * nz; # else /* not SCALAR_COMPLEX */ CHECK(rank > 1, "rank < 2 MPI computations are not supported"); d->iplans[0] = rfftwnd_mpi_create_plan(MPI_COMM_WORLD, rank, n, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE); /* Unlike fftwnd_mpi, we do *not* pass transposed dimensions for the reverse transform here--we always pass the dimensions of the original real array, and rfftwnd_mpi assumes that if one transform is transposed, then the other is as well. 
*/ d->plans[0] = rfftwnd_mpi_create_plan(MPI_COMM_WORLD, rank, n, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE); rfftwnd_mpi_local_sizes(d->iplans[0], &d->local_nx, &d->local_x_start, &d->local_ny, &d->local_y_start, &fft_data_size); d->last_dim_size = 2 * (d->last_dim / 2 + 1); if (rank == 2) d->fft_output_size = nx * d->local_ny * nz; else d->fft_output_size = nx * d->local_ny * (d->last_dim_size / 2); # endif /* not SCALAR_COMPLEX */ *local_N = d->local_nx * ny * nz; *N_start = d->local_x_start * ny * nz; *alloc_N = *local_N; d->other_dims = *local_N / d->last_dim; # endif /* HAVE_FFTW */ #endif /* HAVE_MPI */ /* ----------------------------------------------------- */ #ifdef HAVE_FFTW CHECK(d->plans[0] && d->iplans[0], "FFTW plan creation failed"); #endif CHK_MALLOC(d->eps_inv, symmetric_matrix, d->fft_output_size); /* A scratch output array is required because the "ordinary" arrays are not in a cartesian basis (or even a constant basis). */ fft_data_size *= d->max_fft_bands; #if defined(HAVE_FFTW3) d->fft_data = (scalar *) FFTW(malloc)(sizeof(scalar) * 3 * fft_data_size); CHECK(d->fft_data, "out of memory!"); d->fft_data2 = d->fft_data; /* works in-place */ #else CHK_MALLOC(d->fft_data, scalar, 3 * fft_data_size); d->fft_data2 = d->fft_data; /* works in-place */ #endif CHK_MALLOC(d->k_plus_G, k_data, *local_N); CHK_MALLOC(d->k_plus_G_normsqr, real, *local_N); d->eps_inv_mean = 1.0; d->local_N = *local_N; d->N_start = *N_start; d->alloc_N = *alloc_N; d->N = nx * ny * nz; return d; }