void rdwisdom(void) { FILE *f; double tim; int success = 0; if (havewisdom) return; #ifdef HAVE_SMP BENCH_ASSERT(FFTW(init_threads)()); FFTW(plan_with_nthreads)(nthreads); #endif if (!usewisdom) return; timer_start(USER_TIMER); if ((f = fopen(wisdat, "r"))) { if (!import_wisdom(f)) fprintf(stderr, "bench: ERROR reading wisdom\n"); else success = 1; fclose(f); } tim = timer_stop(USER_TIMER); if (success) { if (verbose > 1) printf("READ WISDOM (%g seconds): ", tim); if (verbose > 3) export_wisdom(stdout); if (verbose > 1) printf("\n"); } havewisdom = 1; }
static FFTW(plan) mkplan_real(bench_problem *p, unsigned flags) { FFTW(plan) pln = 0; int i; ptrdiff_t ntot; vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1; if (p->sz->rnk < 2 || p->split || !tensor_real_contiguousp(p->sz, p->sign, vn) || tensor_rowmajor_transposedp(p->sz) || p->vecsz->rnk > 1 || (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1 || p->vecsz->dims[0].os != 1))) return 0; alloc_rnk(p->sz->rnk); for (i = 0; i < rnk; ++i) { total_ni[i] = total_no[i] = p->sz->dims[i].n; local_ni[i] = local_no[i] = total_ni[i]; local_starti[i] = local_starto[i] = 0; } local_ni[rnk-1] = local_no[rnk-1] = total_ni[rnk-1] = total_no[rnk-1] = p->sz->dims[rnk-1].n / 2 + 1; { ptrdiff_t n, start, nT, startT; ntot = FFTW(mpi_local_size_many_transposed) (p->sz->rnk, total_ni, vn, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, MPI_COMM_WORLD, &n, &start, &nT, &startT); if (flags & FFTW_MPI_TRANSPOSED_IN) { local_ni[1] = nT; local_starti[1] = startT; } else { local_ni[0] = n; local_starti[0] = start; } if (flags & FFTW_MPI_TRANSPOSED_OUT) { local_no[1] = nT; local_starto[1] = startT; } else { local_no[0] = n; local_starto[0] = start; } } alloc_local(ntot * 2, p->in == p->out); total_ni[rnk - 1] = p->sz->dims[rnk - 1].n; if (p->sign < 0) pln = FFTW(mpi_plan_many_dft_r2c)(p->sz->rnk, total_ni, vn, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, local_in, (FFTW(complex) *) local_out, MPI_COMM_WORLD, flags); else
void useropt(const char *arg) { int x; double y; if (!strcmp(arg, "patient")) the_flags |= FFTW_PATIENT; else if (!strcmp(arg, "estimate")) the_flags |= FFTW_ESTIMATE; else if (!strcmp(arg, "estimatepat")) the_flags |= FFTW_ESTIMATE_PATIENT; else if (!strcmp(arg, "exhaustive")) the_flags |= FFTW_EXHAUSTIVE; else if (!strcmp(arg, "unaligned")) the_flags |= FFTW_UNALIGNED; else if (!strcmp(arg, "nosimd")) the_flags |= FFTW_NO_SIMD; else if (!strcmp(arg, "noindirectop")) the_flags |= FFTW_NO_INDIRECT_OP; else if (!strcmp(arg, "wisdom-only")) the_flags |= FFTW_WISDOM_ONLY; else if (sscanf(arg, "flag=%d", &x) == 1) the_flags |= x; else if (sscanf(arg, "bflag=%d", &x) == 1) the_flags |= 1U << x; else if (!strcmp(arg, "paranoid")) paranoid = 1; else if (!strcmp(arg, "wisdom")) usewisdom = 1; else if (!strcmp(arg, "amnesia")) amnesia = 1; else if (sscanf(arg, "nthreads=%d", &x) == 1) nthreads = x; #if HAVE_CELL else if (sscanf(arg, "nspe=%d", &x) == 1) FFTW(cell_set_nspe)(x); #endif else if (sscanf(arg, "timelimit=%lg", &y) == 1) { FFTW(set_timelimit)(y); } else fprintf(stderr, "unknown user option: %s. Ignoring.\n", arg); }
/*
 * Execute the current global plan `iter` times.
 * The problem descriptor is not needed here; a non-positive `iter` runs
 * nothing.
 */
void doit(int iter, bench_problem *p)
{
    FFTW(plan) plan = the_plan;
    int remaining;

    UNUSED(p);

    for (remaining = iter; remaining > 0; --remaining)
        FFTW(execute)(plan);
}
/*
 * Tear down after a benchmark problem: destroy the global plan, then remove
 * the hook via uninstall_hook().  The problem descriptor is unused.
 */
void done(bench_problem *p)
{
    FFTW(plan) finished = the_plan;

    UNUSED(p);

    FFTW(destroy_plan)(finished);
    uninstall_hook();
}
void rdwisdom(void) { FILE *f; double tim; int success = 0; if (havewisdom) return; #ifdef HAVE_SMP if (threads_ok) { BENCH_ASSERT(FFTW(init_threads)()); FFTW(plan_with_nthreads)(nthreads); FFTW(make_planner_thread_safe)(); #ifdef _OPENMP omp_set_num_threads(nthreads); #endif } else if (nthreads > 1 && verbose > 1) { fprintf(stderr, "bench: WARNING - nthreads = %d, but threads not supported\n", nthreads); nthreads = 1; } #endif if (!usewisdom) return; timer_start(USER_TIMER); if ((f = fopen(wisdat, "r"))) { if (!import_wisdom(f)) fprintf(stderr, "bench: ERROR reading wisdom\n"); else success = 1; fclose(f); } tim = timer_stop(USER_TIMER); if (success) { if (verbose > 1) printf("READ WISDOM (%g seconds): ", tim); if (verbose > 3) export_wisdom(stdout); if (verbose > 1) printf("\n"); } havewisdom = 1; }
/*
 * Release a maxwell_data structure and everything it owns.
 *
 * A null `d` is accepted and ignored.  For each of the d->nplans entries the
 * forward and inverse plans are destroyed with the routine matching the
 * configured FFT backend (FFTW3, or FFTW2 in its MPI/serial and
 * complex/real variants).  The eps_inv, fft_data, k_plus_G and
 * k_plus_G_normsqr arrays are then freed, followed by `d` itself.  Under
 * FFTW3, fft_data2 is freed separately only if it does not alias fft_data.
 */
void destroy_maxwell_data(maxwell_data *d)
{
    int plan_idx;

    if (!d)
        return;

    for (plan_idx = 0; plan_idx < d->nplans; ++plan_idx) {
#if defined(HAVE_FFTW3)
        FFTW(destroy_plan)((fftplan) (d->plans[plan_idx]));
        FFTW(destroy_plan)((fftplan) (d->iplans[plan_idx]));
#elif defined(HAVE_FFTW)
#  ifdef HAVE_MPI
#    ifdef SCALAR_COMPLEX
        fftwnd_mpi_destroy_plan((fftplan) (d->plans[plan_idx]));
        fftwnd_mpi_destroy_plan((fftplan) (d->iplans[plan_idx]));
#    else /* not SCALAR_COMPLEX */
        rfftwnd_mpi_destroy_plan((fftplan) (d->plans[plan_idx]));
        rfftwnd_mpi_destroy_plan((fftplan) (d->iplans[plan_idx]));
#    endif /* not SCALAR_COMPLEX */
#  else /* not HAVE_MPI */
#    ifdef SCALAR_COMPLEX
        fftwnd_destroy_plan((fftplan) (d->plans[plan_idx]));
        fftwnd_destroy_plan((fftplan) (d->iplans[plan_idx]));
#    else /* not SCALAR_COMPLEX */
        rfftwnd_destroy_plan((fftplan) (d->plans[plan_idx]));
        rfftwnd_destroy_plan((fftplan) (d->iplans[plan_idx]));
#    endif /* not SCALAR_COMPLEX */
#  endif /* not HAVE_MPI */
#endif /* HAVE FFTW */
    }

    free(d->eps_inv);

#if defined(HAVE_FFTW3)
    /* FFTW3 buffers come from FFTW's allocator and must go back to it */
    FFTW(free)(d->fft_data);
    if (d->fft_data2 != d->fft_data)
        FFTW(free)(d->fft_data2);
#else
    free(d->fft_data);
#endif

    free(d->k_plus_G);
    free(d->k_plus_G_normsqr);

    free(d);
}
/*
 * Final benchmark shutdown sequence: run initial_cleanup(), write wisdom,
 * release FFTW's state (the threaded cleanup in HAVE_SMP builds, the plain
 * one otherwise), optionally print allocator statistics when
 * FFTW_DEBUG_MALLOC is defined, then run final_cleanup().
 */
void cleanup(void)
{
    initial_cleanup();

    wrwisdom();

#if defined(HAVE_SMP)
    FFTW(cleanup_threads)();
#else
    FFTW(cleanup)();
#endif

#if defined(FFTW_DEBUG_MALLOC)
    {
        /* undocumented memory checker */
        FFTW_EXTERN void FFTW(malloc_print_minfo)(int v);

        FFTW(malloc_print_minfo)(verbose);
    }
#endif

    final_cleanup();
}
/*
 * Build a plan that locally transposes an nx-by-ny array of vn-tuples.
 *
 * The plan is created through the 64-bit guru r2r interface with the first
 * two arguments 0 and three loop dimensions in the howmany slot, so only the
 * strides below shape the data movement:
 *   input  is indexed as [nx][ny][vn]  (strides ny*vn, vn, 1)
 *   output is indexed as [ny][nx][vn]  (strides vn, nx*vn, 1)
 * Planning uses FFTW_ESTIMATE; a null plan trips BENCH_ASSERT.
 */
static FFTW(plan) mkplan_transpose_local(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t vn, bench_real *in, bench_real *out)
{
    enum { NLOOPS = 3 };
    FFTW(iodim64) loops[NLOOPS];
    FFTW(r2r_kind) kinds[NLOOPS];
    FFTW(plan) transpose;
    int i;

    /* first index: contiguous rows on input, interleaved on output */
    loops[0].n = nx;
    loops[0].is = ny * vn;
    loops[0].os = vn;

    /* second index: the roles of the two strides are swapped */
    loops[1].n = ny;
    loops[1].is = vn;
    loops[1].os = nx * vn;

    /* innermost tuple is copied contiguously */
    loops[2].n = vn;
    loops[2].is = 1;
    loops[2].os = 1;

    for (i = 0; i < NLOOPS; ++i)
        kinds[i] = FFTW_R2HC;

    transpose = FFTW(plan_guru64_r2r)(0, 0, NLOOPS, loops, in, out, kinds,
                                      FFTW_ESTIMATE);
    BENCH_ASSERT(transpose != 0);
    return transpose;
}
/*
 * Prepare to benchmark problem `p`: create the global plan `the_plan`.
 *
 * Steps, in order: install the SIGFPE handler; if `amnesia` is set, forget
 * all wisdom and clear `havewisdom`; exercise the FFTW allocator once;
 * (re)read wisdom; install the hook via install_hook(); time the call to
 * mkplan(); assert that a plan was produced; query the plan's flop counts
 * and cost figures and print them together with the plan when verbose > 1.
 */
void setup(bench_problem *p)
{
    double planning_time;
    double add, mul, nfma, cost, pcost;

    setup_sigfpe_handler();

    if (amnesia) {
        FFTW(forget_wisdom)();
        havewisdom = 0;
    }

    /* Regression test: check that fftw_malloc exists and links
     * properly.  (An alignment assertion on the returned pointer is
     * intentionally left disabled.) */
    {
        void *probe = FFTW(malloc(42));
        FFTW(free(probe));
    }

    rdwisdom();
    install_hook();

#ifdef HAVE_SMP
    if (verbose > 1 && nthreads > 1)
        printf("NTHREADS = %d\n", nthreads);
#endif

    timer_start(USER_TIMER);
    the_plan = mkplan(p, preserve_input_flags(p) | the_flags);
    planning_time = timer_stop(USER_TIMER);

    if (verbose > 1)
        printf("planner time: %g s\n", planning_time);

    BENCH_ASSERT(the_plan);

    /* These queries run even in quiet mode; only the printing is gated. */
    FFTW(flops)(the_plan, &add, &mul, &nfma);
    cost = FFTW(estimate_cost)(the_plan);
    pcost = FFTW(cost)(the_plan);

    if (verbose > 1) {
        FFTW(print_plan)(the_plan);
        printf("\n");
        printf("flops: %0.0f add, %0.0f mul, %0.0f fma\n", add, mul, nfma);
        printf("estimated cost: %f, pcost = %f\n", cost, pcost);
    }
}
/*
 * Report whether problem `p` can be planned at all.
 *
 * Wisdom is read first, then a plan is attempted with FFTW_ESTIMATE added to
 * the usual flags.  A successful trial plan is destroyed immediately.
 * Returns 1 if a plan could be created, 0 otherwise.  Progress and timing
 * messages are printed when verbose > 2.
 */
int can_do(bench_problem *p)
{
    double elapsed;

    if (verbose > 2 && p->pstring)
        printf("Planning %s...\n", p->pstring);

    rdwisdom();

    timer_start(USER_TIMER);
    the_plan = mkplan(p, preserve_input_flags(p) | the_flags | FFTW_ESTIMATE);
    elapsed = timer_stop(USER_TIMER);

    if (verbose > 2)
        printf("estimate-planner time: %g s\n", elapsed);

    if (!the_plan)
        return 0;

    FFTW(destroy_plan)(the_plan);
    return 1;
}
/* Return the FFTW library's version string. */
static const char *mkversion(void)
{
    const char *version_text = FFTW(version);

    return version_text;
}
/*
 * Hook run after a problem's real output has been copied to `ro`: if an
 * output-unscrambling plan exists it is executed first, then the output is
 * gathered into `ro` via do_gather_out().  The problem descriptor is unused.
 */
void after_problem_rcopy_to(bench_problem *p, bench_real *ro)
{
    UNUSED(p);

    if (plan_unscramble_out != 0) {
        FFTW(execute)(plan_unscramble_out);
    }

    do_gather_out(ro);
}
/* NxMxK the size of the data * comm communicator to use for fft5d * P0 number of processor in 1st axes (can be null for automatic) * lin is allocated by fft5d because size of array is only known after planning phase * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller */ fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads) { int P[2], bMaster, prank[2], i, t; int rNG, rMG, rKG; int *N0 = 0, *N1 = 0, *M0 = 0, *M1 = 0, *K0 = 0, *K1 = 0, *oN0 = 0, *oN1 = 0, *oM0 = 0, *oM1 = 0, *oK0 = 0, *oK1 = 0; int N[3], M[3], K[3], pN[3], pM[3], pK[3], oM[3], oK[3], *iNin[3] = {0}, *oNin[3] = {0}, *iNout[3] = {0}, *oNout[3] = {0}; int C[3], rC[3], nP[2]; int lsize; t_complex *lin = 0, *lout = 0, *lout2 = 0, *lout3 = 0; fft5d_plan plan; int s; /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */ #ifdef GMX_MPI if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL) { MPI_Comm_size(comm[0], &P[0]); MPI_Comm_rank(comm[0], &prank[0]); } else #endif { P[0] = 1; prank[0] = 0; } #ifdef GMX_MPI if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL) { MPI_Comm_size(comm[1], &P[1]); MPI_Comm_rank(comm[1], &prank[1]); } else #endif { P[1] = 1; prank[1] = 0; } bMaster = (prank[0] == 0 && prank[1] == 0); if (debug) { fprintf(debug, "FFT5D: Using %dx%d processor grid, rank %d,%d\n", P[0], P[1], prank[0], prank[1]); } if (bMaster) { if (debug) { fprintf(debug, "FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n", NG, MG, KG, P[0], P[1], (flags&FFT5D_REALCOMPLEX) > 0, (flags&FFT5D_BACKWARD) > 0, (flags&FFT5D_ORDER_YZ) > 0, (flags&FFT5D_DEBUG) > 0); } /* The check below is not correct, one prime factor 11 or 13 is ok. 
if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) { printf("WARNING: FFT very slow with prime factors larger 7\n"); printf("Change FFT size or in case you cannot change it look at\n"); printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n"); } */ } if (NG == 0 || MG == 0 || KG == 0) { if (bMaster) { printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n"); } return 0; } rNG = NG; rMG = MG; rKG = KG; if (flags&FFT5D_REALCOMPLEX) { if (!(flags&FFT5D_BACKWARD)) { NG = NG/2+1; } else { if (!(flags&FFT5D_ORDER_YZ)) { MG = MG/2+1; } else { KG = KG/2+1; } } } /*for transpose we need to know the size for each processor not only our own size*/ N0 = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int)); M0 = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int)); K0 = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int)); oN0 = (int*)malloc(P[0]*sizeof(int)); oN1 = (int*)malloc(P[1]*sizeof(int)); oM0 = (int*)malloc(P[0]*sizeof(int)); oM1 = (int*)malloc(P[1]*sizeof(int)); oK0 = (int*)malloc(P[0]*sizeof(int)); oK1 = (int*)malloc(P[1]*sizeof(int)); for (i = 0; i < P[0]; i++) { #define EVENDIST #ifndef EVENDIST oN0[i] = i*ceil((double)NG/P[0]); oM0[i] = i*ceil((double)MG/P[0]); oK0[i] = i*ceil((double)KG/P[0]); #else oN0[i] = (NG*i)/P[0]; oM0[i] = (MG*i)/P[0]; oK0[i] = (KG*i)/P[0]; #endif } for (i = 0; i < P[1]; i++) { #ifndef EVENDIST oN1[i] = i*ceil((double)NG/P[1]); oM1[i] = i*ceil((double)MG/P[1]); oK1[i] = i*ceil((double)KG/P[1]); #else oN1[i] = (NG*i)/P[1]; oM1[i] = (MG*i)/P[1]; oK1[i] = (KG*i)/P[1]; #endif } for (i = 0; i < P[0]-1; i++) { N0[i] = oN0[i+1]-oN0[i]; M0[i] = oM0[i+1]-oM0[i]; K0[i] = oK0[i+1]-oK0[i]; } N0[P[0]-1] = NG-oN0[P[0]-1]; M0[P[0]-1] = MG-oM0[P[0]-1]; K0[P[0]-1] = KG-oK0[P[0]-1]; for (i = 0; i < P[1]-1; i++) { N1[i] = oN1[i+1]-oN1[i]; M1[i] = oM1[i+1]-oM1[i]; K1[i] = oK1[i+1]-oK1[i]; } N1[P[1]-1] = NG-oN1[P[1]-1]; M1[P[1]-1] = MG-oM1[P[1]-1]; K1[P[1]-1] = 
KG-oK1[P[1]-1]; /* for step 1-3 the local N,M,K sizes of the transposed system C: contiguous dimension, and nP: number of processor in subcommunicator for that step */ pM[0] = M0[prank[0]]; oM[0] = oM0[prank[0]]; pK[0] = K1[prank[1]]; oK[0] = oK1[prank[1]]; C[0] = NG; rC[0] = rNG; if (!(flags&FFT5D_ORDER_YZ)) { N[0] = vmax(N1, P[1]); M[0] = M0[prank[0]]; K[0] = vmax(K1, P[1]); pN[0] = N1[prank[1]]; iNout[0] = N1; oNout[0] = oN1; nP[0] = P[1]; C[1] = KG; rC[1] = rKG; N[1] = vmax(K0, P[0]); pN[1] = K0[prank[0]]; iNin[1] = K1; oNin[1] = oK1; iNout[1] = K0; oNout[1] = oK0; M[1] = vmax(M0, P[0]); pM[1] = M0[prank[0]]; oM[1] = oM0[prank[0]]; K[1] = N1[prank[1]]; pK[1] = N1[prank[1]]; oK[1] = oN1[prank[1]]; nP[1] = P[0]; C[2] = MG; rC[2] = rMG; iNin[2] = M0; oNin[2] = oM0; M[2] = vmax(K0, P[0]); pM[2] = K0[prank[0]]; oM[2] = oK0[prank[0]]; K[2] = vmax(N1, P[1]); pK[2] = N1[prank[1]]; oK[2] = oN1[prank[1]]; free(N0); free(oN0); /*these are not used for this order*/ free(M1); free(oM1); /*the rest is freed in destroy*/ } else { N[0] = vmax(N0, P[0]); M[0] = vmax(M0, P[0]); K[0] = K1[prank[1]]; pN[0] = N0[prank[0]]; iNout[0] = N0; oNout[0] = oN0; nP[0] = P[0]; C[1] = MG; rC[1] = rMG; N[1] = vmax(M1, P[1]); pN[1] = M1[prank[1]]; iNin[1] = M0; oNin[1] = oM0; iNout[1] = M1; oNout[1] = oM1; M[1] = N0[prank[0]]; pM[1] = N0[prank[0]]; oM[1] = oN0[prank[0]]; K[1] = vmax(K1, P[1]); pK[1] = K1[prank[1]]; oK[1] = oK1[prank[1]]; nP[1] = P[1]; C[2] = KG; rC[2] = rKG; iNin[2] = K1; oNin[2] = oK1; M[2] = vmax(N0, P[0]); pM[2] = N0[prank[0]]; oM[2] = oN0[prank[0]]; K[2] = vmax(M1, P[1]); pK[2] = M1[prank[1]]; oK[2] = oM1[prank[1]]; free(N1); free(oN1); /*these are not used for this order*/ free(K0); free(oK0); /*the rest is freed in destroy*/ } N[2] = pN[2] = -1; /*not used*/ /* Difference between x-y-z regarding 2d decomposition is whether they are distributed along axis 1, 2 or both */ /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */ lsize = 
std::max(N[0]*M[0]*K[0]*nP[0], std::max(N[1]*M[1]*K[1]*nP[1], C[2]*M[2]*K[2])); /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */ if (!(flags&FFT5D_NOMALLOC)) { snew_aligned(lin, lsize, 32); snew_aligned(lout, lsize, 32); if (nthreads > 1) { /* We need extra transpose buffers to avoid OpenMP barriers */ snew_aligned(lout2, lsize, 32); snew_aligned(lout3, lsize, 32); } else { /* We can reuse the buffers to avoid cache misses */ lout2 = lin; lout3 = lout; } } else { lin = *rlin; lout = *rlout; if (nthreads > 1) { lout2 = *rlout2; lout3 = *rlout3; } else { lout2 = lin; lout3 = lout; } } plan = (fft5d_plan)calloc(1, sizeof(struct fft5d_plan_t)); if (debug) { fprintf(debug, "Running on %d threads\n", nthreads); } #ifdef GMX_FFT_FFTW3 /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */ /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from * within a parallel region. For now deactivated. If it should be supported it has to made sure that * that the execute of the 3d plan is in a master/serial block (since it contains it own parallel region) * and that the 3d plan is faster than the 1d plan. 
*/ if ((!(flags&FFT5D_INPLACE)) && (!(P[0] > 1 || P[1] > 1)) && nthreads == 1) /*don't do 3d plan in parallel or if in_place requested */ { int fftwflags = FFTW_DESTROY_INPUT; FFTW(iodim) dims[3]; int inNG = NG, outMG = MG, outKG = KG; FFTW_LOCK; if (!(flags&FFT5D_NOMEASURE)) { fftwflags |= FFTW_MEASURE; } if (flags&FFT5D_REALCOMPLEX) { if (!(flags&FFT5D_BACKWARD)) /*input pointer is not complex*/ { inNG *= 2; } else /*output pointer is not complex*/ { if (!(flags&FFT5D_ORDER_YZ)) { outMG *= 2; } else { outKG *= 2; } } } if (!(flags&FFT5D_BACKWARD)) { dims[0].n = KG; dims[1].n = MG; dims[2].n = rNG; dims[0].is = inNG*MG; /*N M K*/ dims[1].is = inNG; dims[2].is = 1; if (!(flags&FFT5D_ORDER_YZ)) { dims[0].os = MG; /*M K N*/ dims[1].os = 1; dims[2].os = MG*KG; } else { dims[0].os = 1; /*K N M*/ dims[1].os = KG*NG; dims[2].os = KG; } } else { if (!(flags&FFT5D_ORDER_YZ)) { dims[0].n = NG; dims[1].n = KG; dims[2].n = rMG; dims[0].is = 1; dims[1].is = NG*MG; dims[2].is = NG; dims[0].os = outMG*KG; dims[1].os = outMG; dims[2].os = 1; } else { dims[0].n = MG; dims[1].n = NG; dims[2].n = rKG; dims[0].is = NG; dims[1].is = 1; dims[2].is = NG*MG; dims[0].os = outKG*NG; dims[1].os = outKG; dims[2].os = 1; } } #ifdef FFT5D_THREADS #ifdef FFT5D_FFTW_THREADS FFTW(plan_with_nthreads)(nthreads); #endif #endif if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD)) { plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims, /*howmany*/ 0, /*howmany_dims*/ 0, (real*)lin, (FFTW(complex) *) lout, /*flags*/ fftwflags); }
int main(int argc, char **argv) { int n; /**< expansion degree */ int m; /**< cut-off parameter */ int p; /**< degree of smoothness */ char *s; /**< name of kernel */ C (*kernel)(R, int, const R *); /**< kernel function */ R c; /**< parameter for kernel */ R eps_I; /**< inner boundary */ R eps_B; /**< outer boundary */ #ifdef _OPENMP int nthreads; if (argc != 9) return EXIT_FAILURE; nthreads = atoi(argv[8]); FFTW(init_threads)(); omp_set_num_threads(nthreads); #else if (argc != 8) return EXIT_FAILURE; #endif n = atoi(argv[1]); m = atoi(argv[2]); p = atoi(argv[3]); s = argv[4]; c = atof(argv[5]); eps_I = (R)(atof(argv[6])); eps_B = (R)(atof(argv[7])); if (strcmp(s, "gaussian") == 0) kernel = gaussian; else if (strcmp(s, "multiquadric") == 0) kernel = multiquadric; else if (strcmp(s, "inverse_multiquadric") == 0) kernel = inverse_multiquadric; else if (strcmp(s, "logarithm") == 0) kernel = logarithm; else if (strcmp(s, "thinplate_spline") == 0) kernel = thinplate_spline; else if (strcmp(s, "one_over_square") == 0) kernel = one_over_square; else if (strcmp(s, "one_over_modulus") == 0) kernel = one_over_modulus; else if (strcmp(s, "one_over_x") == 0) kernel = one_over_x; else if (strcmp(s, "inverse_multiquadric3") == 0) kernel = inverse_multiquadric3; else if (strcmp(s, "sinc_kernel") == 0) kernel = sinc_kernel; else if (strcmp(s, "cosc") == 0) kernel = cosc; else if (strcmp(s, "cot") == 0) kernel = kcot; else { s = "multiquadric"; kernel = multiquadric; } bench_openmp(stdin, n, m, p, kernel, c, eps_I, eps_B); return EXIT_SUCCESS; }
/* Return the library's `cc` string (FFTW(cc)). */
static const char *mkcc(void)
{
    const char *compiler_text = FFTW(cc);

    return compiler_text;
}
/* Return the library's `codelet_optim` string (FFTW(codelet_optim)). */
static const char *mkcodelet_optim(void)
{
    const char *optim_text = FFTW(codelet_optim);

    return optim_text;
}
/*
 * Create an MPI plan for a complex DFT described by `p`, or return 0 if the
 * problem is not of a supported form.
 *
 * Rejected problems: transform rank < 1, split format, non-contiguous or
 * row-major-transposed size tensor, vector rank > 1, or a rank-1 vector
 * whose input or output stride is not 1.
 *
 * Side effects on file-scope state: sets `vn` (the vector length, doubled
 * after planning), fills the total_* / local_* dimension arrays, allocates
 * the local buffers via alloc_local(), and for rank > 1 creates
 * plan_scramble_in / plan_unscramble_out when the matching
 * FFTW_MPI_TRANSPOSED_* flag is given.
 *
 * NOTE(review): the code below relies on alloc_rnk() making the file-scope
 * `rnk` equal to p->sz->rnk -- confirm against alloc_rnk.
 */
static FFTW(plan) mkplan_complex(bench_problem *p, unsigned flags)
{
    const unsigned transposed_in = flags & FFTW_MPI_TRANSPOSED_IN;
    const unsigned transposed_out = flags & FFTW_MPI_TRANSPOSED_OUT;
    FFTW(plan) pln = 0;
    ptrdiff_t ntot;
    int dim;

    vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1;

    /* Supported-problem checks, evaluated in the original order. */
    if (p->sz->rnk < 1 || p->split)
        return 0;
    if (!tensor_contiguousp(p->sz, vn) || tensor_rowmajor_transposedp(p->sz))
        return 0;
    if (p->vecsz->rnk > 1)
        return 0;
    if (p->vecsz->rnk == 1
        && (p->vecsz->dims[0].is != 1 || p->vecsz->dims[0].os != 1))
        return 0;

    alloc_rnk(p->sz->rnk);

    /* Start from "everything is local"; the distributed dimension is
       overwritten below. */
    for (dim = 0; dim < rnk; ++dim) {
        total_ni[dim] = total_no[dim] = p->sz->dims[dim].n;
        local_ni[dim] = local_no[dim] = total_ni[dim];
        local_starti[dim] = local_starto[dim] = 0;
    }

    if (rnk > 1) {
        ptrdiff_t n, start, nT, startT;

        ntot = FFTW(mpi_local_size_many_transposed)
            (p->sz->rnk, total_ni, vn,
             FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
             MPI_COMM_WORLD,
             &n, &start, &nT, &startT);

        /* Transposed data is distributed along dimension 1, otherwise
           along dimension 0. */
        if (transposed_in) {
            local_ni[1] = nT;
            local_starti[1] = startT;
        }
        else {
            local_ni[0] = n;
            local_starti[0] = start;
        }

        if (transposed_out) {
            local_no[1] = nT;
            local_starto[1] = startT;
        }
        else {
            local_no[0] = n;
            local_starto[0] = start;
        }
    }
    else if (rnk == 1) {
        ntot = FFTW(mpi_local_size_many_1d)
            (total_ni[0], vn, MPI_COMM_WORLD, p->sign, flags,
             local_ni, local_starti, local_no, local_starto);
    }

    /* two reals per complex element */
    alloc_local(ntot * 2, p->in == p->out);

    pln = FFTW(mpi_plan_many_dft)(p->sz->rnk, total_ni, vn,
                                  FFTW_MPI_DEFAULT_BLOCK,
                                  FFTW_MPI_DEFAULT_BLOCK,
                                  (FFTW(complex) *) local_in,
                                  (FFTW(complex) *) local_out,
                                  MPI_COMM_WORLD, p->sign, flags);

    /* from here on vn counts reals rather than complex numbers */
    vn *= 2;

    if (rnk > 1) {
        ptrdiff_t nrest = 1;

        for (dim = 2; dim < rnk; ++dim)
            nrest *= p->sz->dims[dim].n;

        if (transposed_in)
            plan_scramble_in = mkplan_transpose_local(
                p->sz->dims[0].n, local_ni[1], vn * nrest,
                local_in, local_in);

        if (transposed_out)
            plan_unscramble_out = mkplan_transpose_local(
                local_no[1], p->sz->dims[0].n, vn * nrest,
                local_out, local_out);
    }

    return pln;
}
int main(int argc, char **argv) { int j, k; /**< indices */ int d; /**< number of dimensions */ int N; /**< number of source nodes */ int M; /**< number of target nodes */ int n; /**< expansion degree */ int m; /**< cut-off parameter */ int p; /**< degree of smoothness */ const char *s; /**< name of kernel */ C (*kernel)(R, int, const R *); /**< kernel function */ R c; /**< parameter for kernel */ fastsum_plan my_fastsum_plan; /**< plan for fast summation */ C *direct; /**< array for direct computation */ ticks t0, t1; /**< for time measurement */ R time; /**< for time measurement */ R error = K(0.0); /**< for error computation */ R eps_I; /**< inner boundary */ R eps_B; /**< outer boundary */ if (argc != 11) { printf("\nfastsum_test d N M n m p kernel c eps_I eps_B\n\n"); printf(" d dimension \n"); printf(" N number of source nodes \n"); printf(" M number of target nodes \n"); printf(" n expansion degree \n"); printf(" m cut-off parameter \n"); printf(" p degree of smoothness \n"); printf(" kernel kernel function (e.g., gaussian)\n"); printf(" c kernel parameter \n"); printf(" eps_I inner boundary \n"); printf(" eps_B outer boundary \n\n"); exit(EXIT_FAILURE); } else { d = atoi(argv[1]); N = atoi(argv[2]); c = K(1.0) / POW((R)(N), K(1.0) / ((R)(d))); M = atoi(argv[3]); n = atoi(argv[4]); m = atoi(argv[5]); p = atoi(argv[6]); s = argv[7]; c = (R)(atof(argv[8])); eps_I = (R)(atof(argv[9])); eps_B = (R)(atof(argv[10])); if (strcmp(s, "gaussian") == 0) kernel = gaussian; else if (strcmp(s, "multiquadric") == 0) kernel = multiquadric; else if (strcmp(s, "inverse_multiquadric") == 0) kernel = inverse_multiquadric; else if (strcmp(s, "logarithm") == 0) kernel = logarithm; else if (strcmp(s, "thinplate_spline") == 0) kernel = thinplate_spline; else if (strcmp(s, "one_over_square") == 0) kernel = one_over_square; else if (strcmp(s, "one_over_modulus") == 0) kernel = one_over_modulus; else if (strcmp(s, "one_over_x") == 0) kernel = one_over_x; else if (strcmp(s, 
"inverse_multiquadric3") == 0) kernel = inverse_multiquadric3; else if (strcmp(s, "sinc_kernel") == 0) kernel = sinc_kernel; else if (strcmp(s, "cosc") == 0) kernel = cosc; else if (strcmp(s, "cot") == 0) kernel = kcot; else { s = "multiquadric"; kernel = multiquadric; } } printf( "d=%d, N=%d, M=%d, n=%d, m=%d, p=%d, kernel=%s, c=%" __FGS__ ", eps_I=%" __FGS__ ", eps_B=%" __FGS__ " \n", d, N, M, n, m, p, s, c, eps_I, eps_B); #ifdef NF_KUB printf("nearfield correction using piecewise cubic Lagrange interpolation\n"); #elif defined(NF_QUADR) printf("nearfield correction using piecewise quadratic Lagrange interpolation\n"); #elif defined(NF_LIN) printf("nearfield correction using piecewise linear Lagrange interpolation\n"); #endif #ifdef _OPENMP #pragma omp parallel { #pragma omp single { printf("nthreads=%d\n", omp_get_max_threads()); } } FFTW(init_threads)(); #endif /** init d-dimensional fastsum plan */ fastsum_init_guru(&my_fastsum_plan, d, N, M, kernel, &c, 0, n, m, p, eps_I, eps_B); //fastsum_init_guru(&my_fastsum_plan, d, N, M, kernel, &c, NEARFIELD_BOXES, n, m, p, eps_I, eps_B); if (my_fastsum_plan.flags & NEARFIELD_BOXES) printf( "determination of nearfield candidates based on partitioning into boxes\n"); else printf("determination of nearfield candidates based on search tree\n"); /** init source knots in a d-ball with radius 0.25-eps_b/2 */ k = 0; while (k < N) { R r_max = K(0.25) - my_fastsum_plan.eps_B / K(2.0); R r2 = K(0.0); for (j = 0; j < d; j++) my_fastsum_plan.x[k * d + j] = K(2.0) * r_max * NFFT(drand48)() - r_max; for (j = 0; j < d; j++) r2 += my_fastsum_plan.x[k * d + j] * my_fastsum_plan.x[k * d + j]; if (r2 >= r_max * r_max) continue; k++; } for (k = 0; k < N; k++) { /* R r=(0.25-my_fastsum_plan.eps_B/2.0)*pow((R)rand()/(R)RAND_MAX,1.0/d); my_fastsum_plan.x[k*d+0] = r; for (j=1; j<d; j++) { R phi=2.0*KPI*(R)rand()/(R)RAND_MAX; my_fastsum_plan.x[k*d+j] = r; for (t=0; t<j; t++) { my_fastsum_plan.x[k*d+t] *= cos(phi); } my_fastsum_plan.x[k*d+j] *= 
sin(phi); } */ my_fastsum_plan.alpha[k] = NFFT(drand48)() + II * NFFT(drand48)(); } /** init target knots in a d-ball with radius 0.25-eps_b/2 */ k = 0; while (k < M) { R r_max = K(0.25) - my_fastsum_plan.eps_B / K(2.0); R r2 = K(0.0); for (j = 0; j < d; j++) my_fastsum_plan.y[k * d + j] = K(2.0) * r_max * NFFT(drand48)() - r_max; for (j = 0; j < d; j++) r2 += my_fastsum_plan.y[k * d + j] * my_fastsum_plan.y[k * d + j]; if (r2 >= r_max * r_max) continue; k++; } /* for (k=0; k<M; k++) { R r=(0.25-my_fastsum_plan.eps_B/2.0)*pow((R)rand()/(R)RAND_MAX,1.0/d); my_fastsum_plan.y[k*d+0] = r; for (j=1; j<d; j++) { R phi=2.0*KPI*(R)rand()/(R)RAND_MAX; my_fastsum_plan.y[k*d+j] = r; for (t=0; t<j; t++) { my_fastsum_plan.y[k*d+t] *= cos(phi); } my_fastsum_plan.y[k*d+j] *= sin(phi); } } */ /** direct computation */ printf("direct computation: "); fflush(NULL); t0 = getticks(); fastsum_exact(&my_fastsum_plan); t1 = getticks(); time = NFFT(elapsed_seconds)(t1, t0); printf(__FI__ "sec\n", time); /** copy result */ direct = (C *) NFFT(malloc)((size_t)(my_fastsum_plan.M_total) * (sizeof(C))); for (j = 0; j < my_fastsum_plan.M_total; j++) direct[j] = my_fastsum_plan.f[j]; /** precomputation */ printf("pre-computation: "); fflush(NULL); t0 = getticks(); fastsum_precompute(&my_fastsum_plan); t1 = getticks(); time = NFFT(elapsed_seconds)(t1, t0); printf(__FI__ "sec\n", time); /** fast computation */ printf("fast computation: "); fflush(NULL); t0 = getticks(); fastsum_trafo(&my_fastsum_plan); t1 = getticks(); time = NFFT(elapsed_seconds)(t1, t0); printf(__FI__ "sec\n", time); /** compute max error */ error = K(0.0); for (j = 0; j < my_fastsum_plan.M_total; j++) { if (CABS(direct[j] - my_fastsum_plan.f[j]) / CABS(direct[j]) > error) error = CABS(direct[j] - my_fastsum_plan.f[j]) / CABS(direct[j]); } printf("max relative error: %" __FES__ "\n", error); /** finalise the plan */ fastsum_finalize(&my_fastsum_plan); return EXIT_SUCCESS; }
/*
 * Allocate and initialise a maxwell_data structure for an nx x ny x nz grid.
 *
 * Parameters:
 *   nx, ny, nz     grid dimensions; trailing dimensions equal to 1 reduce
 *                  the FFT rank (nz == 1 -> rank 2, ny == nz == 1 -> rank 1).
 *   local_N        out: number of grid points handled by this process.
 *   N_start        out: offset of this process's first grid point.
 *   alloc_N        out: allocation size reported for this process.
 *   num_bands      passed to maxwell_set_num_bands().
 *   max_fft_bands  upper limit stored (together with num_bands) in
 *                  d->max_fft_bands.
 *
 * Returns the new structure.  Allocation and plan-creation failures are
 * handled by the CHK_MALLOC / CHECK macros.
 *
 * The body is a compile-time matrix over HAVE_MPI, HAVE_FFTW3 / HAVE_FFTW
 * and SCALAR_COMPLEX.  Every branch must leave fft_data_size,
 * d->fft_output_size, *local_N, *N_start, *alloc_N and d->other_dims set
 * before the common allocation code at the end.
 */
maxwell_data *create_maxwell_data(int nx, int ny, int nz,
                                  int *local_N, int *N_start, int *alloc_N,
                                  int num_bands, int max_fft_bands)
{
     int n[3], rank = (nz == 1) ? (ny == 1 ? 1 : 2) : 3;
     maxwell_data *d = 0;
     int fft_data_size;

     n[0] = nx;
     n[1] = ny;
     n[2] = nz;

#if !defined(HAVE_FFTW) && !defined(HAVE_FFTW3)
#  error Non-FFTW FFTs are not currently supported.
#endif

#if defined(HAVE_FFTW)
     /* FFTW2 buffers are accessed through casts, so the scalar widths
        must agree */
     CHECK(sizeof(fftw_real) == sizeof(real),
           "floating-point type is inconsistent with FFTW!");
#endif

     CHK_MALLOC(d, maxwell_data, 1);

     d->nx = nx;
     d->ny = ny;
     d->nz = nz;

     d->max_fft_bands = MIN2(num_bands, max_fft_bands);
     maxwell_set_num_bands(d, num_bands);

     d->current_k[0] = d->current_k[1] = d->current_k[2] = 0.0;
     d->parity = NO_PARITY;

     /* last_dim is the size of the last non-trivial dimension;
        last_dim_size starts equal to it and is enlarged below for
        real-to-complex transforms */
     d->last_dim_size = d->last_dim = n[rank - 1];

     /* ----------------------------------------------------- */
     d->nplans = 1;
#ifndef HAVE_MPI
     /* serial build: this process owns the whole grid */
     d->local_nx = nx;
     d->local_ny = ny;
     d->local_x_start = d->local_y_start = 0;
     *local_N = *alloc_N = nx * ny * nz;
     *N_start = 0;
     d->other_dims = *local_N / d->last_dim;

     d->fft_data = 0;  /* initialize it here for use in specific planner? */

#  if defined(HAVE_FFTW3)
     d->nplans = 0; /* plans will be created as needed */
#    ifdef SCALAR_COMPLEX
     d->fft_output_size = fft_data_size = nx * ny * nz;
#    else
     /* real data: last dimension padded to 2*(last_dim/2 + 1) reals */
     d->last_dim_size = 2 * (d->last_dim / 2 + 1);
     d->fft_output_size = (fft_data_size = d->other_dims * d->last_dim_size)/2;
#    endif

#  elif defined(HAVE_FFTW)
#    ifdef SCALAR_COMPLEX
     d->fft_output_size = fft_data_size = nx * ny * nz;
     /* note: d->fft_data is still 0 when passed to the planner here */
     d->plans[0] =
          fftwnd_create_plan_specific(rank, n, FFTW_BACKWARD,
                                      FFTW_ESTIMATE | FFTW_IN_PLACE,
                                      (fftw_complex*) d->fft_data,
                                      3 * d->num_fft_bands,
                                      (fftw_complex*) d->fft_data,
                                      3 * d->num_fft_bands);
     d->iplans[0] =
          fftwnd_create_plan_specific(rank, n, FFTW_FORWARD,
                                      FFTW_ESTIMATE | FFTW_IN_PLACE,
                                      (fftw_complex*) d->fft_data,
                                      3 * d->num_fft_bands,
                                      (fftw_complex*) d->fft_data,
                                      3 * d->num_fft_bands);
#    else /* not SCALAR_COMPLEX */
     d->last_dim_size = 2 * (d->last_dim / 2 + 1);
     d->fft_output_size = (fft_data_size = d->other_dims * d->last_dim_size)/2;
     d->plans[0] =
          rfftwnd_create_plan_specific(rank, n, FFTW_COMPLEX_TO_REAL,
                                       FFTW_ESTIMATE | FFTW_IN_PLACE,
                                       (fftw_real*) d->fft_data,
                                       3 * d->num_fft_bands,
                                       (fftw_real*) d->fft_data,
                                       3 * d->num_fft_bands);
     d->iplans[0] =
          rfftwnd_create_plan_specific(rank, n, FFTW_REAL_TO_COMPLEX,
                                       FFTW_ESTIMATE | FFTW_IN_PLACE,
                                       (fftw_real*) d->fft_data,
                                       3 * d->num_fft_bands,
                                       (fftw_real*) d->fft_data,
                                       3 * d->num_fft_bands);
#    endif /* not SCALAR_COMPLEX */
#  endif /* HAVE_FFTW */

#else /* HAVE_MPI */
     /* ----------------------------------------------------- */

#  if defined(HAVE_FFTW3)
     {
          int i;
          ptrdiff_t np[3], local_nx, local_ny, local_x_start, local_y_start;

          CHECK(rank > 1, "rank < 2 MPI computations are not supported");

          d->nplans = 0; /* plans will be created as needed */

          /* np holds the dimensions handed to FFTW-MPI; for real data the
             last one becomes the complex (half + 1) length */
          for (i = 0; i < rank; ++i) np[i] = n[i];

#    ifndef SCALAR_COMPLEX
          d->last_dim_size = 2 * (np[rank-1] = d->last_dim / 2 + 1);
#    endif

          fft_data_size = *alloc_N
               = FFTW(mpi_local_size_transposed)(rank, np, MPI_COMM_WORLD,
                                                 &local_nx, &local_x_start,
                                                 &local_ny, &local_y_start);
#    ifndef SCALAR_COMPLEX
          fft_data_size = (*alloc_N *= 2); // convert to # of real scalars
#    endif

          /* copy the ptrdiff_t results into the structure's fields */
          d->local_nx = local_nx;
          d->local_x_start = local_x_start;
          d->local_ny = local_ny;
          d->local_y_start = local_y_start;

          d->fft_output_size = nx * d->local_ny * (rank==3 ? np[2] : nz);
          *local_N = d->local_nx * ny * nz;
          *N_start = d->local_x_start * ny * nz;
          d->other_dims = *local_N / d->last_dim;
     }
#  elif defined(HAVE_FFTW)

     CHECK(rank > 1, "rank < 2 MPI computations are not supported");

#    ifdef SCALAR_COMPLEX
     d->iplans[0] = fftwnd_mpi_create_plan(MPI_COMM_WORLD, rank, n,
                                           FFTW_FORWARD,
                                           FFTW_ESTIMATE | FFTW_IN_PLACE);
     {
          int nt[3]; /* transposed dimensions for reverse FFT */
          nt[0] = n[1];
          nt[1] = n[0];
          nt[2] = n[2];
          d->plans[0] = fftwnd_mpi_create_plan(MPI_COMM_WORLD, rank, nt,
                                               FFTW_BACKWARD,
                                               FFTW_ESTIMATE | FFTW_IN_PLACE);
     }

     fftwnd_mpi_local_sizes(d->iplans[0], &d->local_nx, &d->local_x_start,
                            &d->local_ny, &d->local_y_start,
                            &fft_data_size);

     d->fft_output_size = nx * d->local_ny * nz;

#    else /* not SCALAR_COMPLEX */

     CHECK(rank > 1, "rank < 2 MPI computations are not supported");

     d->iplans[0] = rfftwnd_mpi_create_plan(MPI_COMM_WORLD, rank, n,
                                            FFTW_REAL_TO_COMPLEX,
                                            FFTW_ESTIMATE | FFTW_IN_PLACE);

     /* Unlike fftwnd_mpi, we do *not* pass transposed dimensions for
        the reverse transform here--we always pass the dimensions of the
        original real array, and rfftwnd_mpi assumes that if one
        transform is transposed, then the other is as well. */
     d->plans[0] = rfftwnd_mpi_create_plan(MPI_COMM_WORLD, rank, n,
                                           FFTW_COMPLEX_TO_REAL,
                                           FFTW_ESTIMATE | FFTW_IN_PLACE);

     rfftwnd_mpi_local_sizes(d->iplans[0], &d->local_nx, &d->local_x_start,
                             &d->local_ny, &d->local_y_start,
                             &fft_data_size);

     d->last_dim_size = 2 * (d->last_dim / 2 + 1);
     if (rank == 2)
          d->fft_output_size = nx * d->local_ny * nz;
     else
          d->fft_output_size = nx * d->local_ny * (d->last_dim_size / 2);

#    endif /* not SCALAR_COMPLEX */

     *local_N = d->local_nx * ny * nz;
     *N_start = d->local_x_start * ny * nz;
     *alloc_N = *local_N;
     d->other_dims = *local_N / d->last_dim;

#  endif /* HAVE_FFTW */

#endif /* HAVE_MPI */

     /* ----------------------------------------------------- */

#ifdef HAVE_FFTW
     /* only the FFTW2 paths create their plans eagerly */
     CHECK(d->plans[0] && d->iplans[0], "FFTW plan creation failed");
#endif

     CHK_MALLOC(d->eps_inv, symmetric_matrix, d->fft_output_size);

     /* A scratch output array is required because the "ordinary" arrays
        are not in a cartesian basis (or even a constant basis). */
     fft_data_size *= d->max_fft_bands;
#if defined(HAVE_FFTW3)
     /* allocated with FFTW's allocator; 3 scalars per point per band */
     d->fft_data = (scalar *) FFTW(malloc)(sizeof(scalar) * 3 * fft_data_size);
     CHECK(d->fft_data, "out of memory!");
     d->fft_data2 = d->fft_data; /* works in-place */
#else
     CHK_MALLOC(d->fft_data, scalar, 3 * fft_data_size);
     d->fft_data2 = d->fft_data; /* works in-place */
#endif

     CHK_MALLOC(d->k_plus_G, k_data, *local_N);
     CHK_MALLOC(d->k_plus_G_normsqr, real, *local_N);

     d->eps_inv_mean = 1.0;

     /* mirror the output parameters inside the structure */
     d->local_N = *local_N;
     d->N_start = *N_start;
     d->alloc_N = *alloc_N;
     d->N = nx * ny * nz;

     return d;
}
/**
 * Runs one fastsum benchmark on the problem read from infile and prints a
 * single line with 16 timing values to stdout.
 *
 * The input is read in this order: the three integers "d L M", then L*d
 * source node coordinates, then L coefficient pairs (real part, imaginary
 * part), then M*d target node coordinates. Every node coordinate is scaled
 * by r_max = 1/4 - eps_B/2 before it is stored in the plan.
 *
 * FFTW wisdom is imported before and exported after the plan is initialised;
 * the file name depends on whether _OPENMP is defined.
 *
 * The measured interval covers fastsum_precompute() and fastsum_trafo().
 * Timers that are not compiled in (MEASURE_TIME / MEASURE_TIME_FFTW not
 * defined) are reported as zero.
 *
 * \param infile  stream to read the problem from
 * \param n, m, p, kernel, c, eps_I, eps_B
 *                forwarded unchanged to fastsum_init_guru()
 * \return 0 on success, 1 if the input could not be parsed
 */
int bench_openmp(FILE *infile, int n, int m, int p,
    C (*kernel)(R, int, const R *), R c, R eps_I, R eps_B)
{
  fastsum_plan my_fastsum_plan;
  int d, L, M;
  int t, j;
  R re, im;
  /* Must use the eps_B argument here: my_fastsum_plan is still
   * uninitialised at this point, so reading my_fastsum_plan.eps_B would be
   * a read of an indeterminate value. */
  const R r_max = K(0.25) - eps_B / K(2.0);
  ticks t0, t1;
  R tt_total;
  R tt_other;

  if (fscanf(infile, "%d %d %d", &d, &L, &M) != 3)
  {
    fprintf(stderr, "bench_openmp: failed to read problem dimensions\n");
    return 1;
  }

#ifdef _OPENMP
  FFTW(import_wisdom_from_filename)("fastsum_benchomp_detail_threads.plan");
#else
  FFTW(import_wisdom_from_filename)("fastsum_benchomp_detail_single.plan");
#endif

  fastsum_init_guru(&my_fastsum_plan, d, L, M, kernel, &c, NEARFIELD_BOXES,
      n, m, p, eps_I, eps_B);

#ifdef _OPENMP
  FFTW(export_wisdom_to_filename)("fastsum_benchomp_detail_threads.plan");
#else
  FFTW(export_wisdom_to_filename)("fastsum_benchomp_detail_single.plan");
#endif

  /* source nodes */
  for (j = 0; j < L; j++)
  {
    for (t = 0; t < d; t++)
    {
      R v;
      if (fscanf(infile, __FR__, &v) != 1)
        goto read_error;
      my_fastsum_plan.x[d * j + t] = v * r_max;
    }
  }

  /* source coefficients */
  for (j = 0; j < L; j++)
  {
    if (fscanf(infile, __FR__ " " __FR__, &re, &im) != 2)
      goto read_error;
    my_fastsum_plan.alpha[j] = re + II * im;
  }

  /* target nodes */
  for (j = 0; j < M; j++)
  {
    for (t = 0; t < d; t++)
    {
      R v;
      if (fscanf(infile, __FR__, &v) != 1)
        goto read_error;
      my_fastsum_plan.y[d * j + t] = v * r_max;
    }
  }

  /** precomputation */
  t0 = getticks();
  fastsum_precompute(&my_fastsum_plan);

  /** fast computation */
  fastsum_trafo(&my_fastsum_plan);
  t1 = getticks();
  tt_total = NFFT(elapsed_seconds)(t1, t0);

#ifndef MEASURE_TIME
  /* Detailed timers are not compiled in: report them as zero. */
  for (t = 0; t < 8; t++)
    my_fastsum_plan.MEASURE_TIME_t[t] = K(0.0);

  my_fastsum_plan.mv1.MEASURE_TIME_t[0] = K(0.0);
  my_fastsum_plan.mv1.MEASURE_TIME_t[2] = K(0.0);
  my_fastsum_plan.mv2.MEASURE_TIME_t[0] = K(0.0);
  my_fastsum_plan.mv2.MEASURE_TIME_t[2] = K(0.0);
#endif

#ifndef MEASURE_TIME_FFTW
  my_fastsum_plan.mv1.MEASURE_TIME_t[1] = K(0.0);
  my_fastsum_plan.mv2.MEASURE_TIME_t[1] = K(0.0);
#endif

  /* Time not attributed to any of the eight fastsum timers; subtracted in
   * index order 0..7. */
  tt_other = tt_total;
  for (t = 0; t < 8; t++)
    tt_other -= my_fastsum_plan.MEASURE_TIME_t[t];

  printf(
      "%.6" __FES__ " %.6" __FES__ " %.6" __FES__ " %6" __FES__ " %.6" __FES__ " %.6" __FES__ " %.6" __FES__ " %.6" __FES__ " %.6" __FES__ " %6" __FES__ " %.6" __FES__ " %.6" __FES__ " %6" __FES__ " %.6" __FES__ " %.6" __FES__ " %6" __FES__ "\n",
      my_fastsum_plan.MEASURE_TIME_t[0],
      my_fastsum_plan.MEASURE_TIME_t[1],
      my_fastsum_plan.MEASURE_TIME_t[2],
      my_fastsum_plan.MEASURE_TIME_t[3],
      my_fastsum_plan.MEASURE_TIME_t[4],
      my_fastsum_plan.MEASURE_TIME_t[5],
      my_fastsum_plan.MEASURE_TIME_t[6],
      my_fastsum_plan.MEASURE_TIME_t[7],
      tt_other,
      tt_total,
      my_fastsum_plan.mv1.MEASURE_TIME_t[0],
      my_fastsum_plan.mv1.MEASURE_TIME_t[1],
      my_fastsum_plan.mv1.MEASURE_TIME_t[2],
      my_fastsum_plan.mv2.MEASURE_TIME_t[0],
      my_fastsum_plan.mv2.MEASURE_TIME_t[1],
      my_fastsum_plan.mv2.MEASURE_TIME_t[2]);

  fastsum_finalize(&my_fastsum_plan);

  return 0;

read_error:
  fprintf(stderr, "bench_openmp: malformed or truncated input data\n");
  fastsum_finalize(&my_fastsum_plan);
  return 1;
}
/** computes the inverse discrete Radon transform of Rf
 *  on the grid given by gridfcn() with T angles and S offsets
 *  by a NFFT-based CG-type algorithm
 *
 *  \param gridfcn  called as gridfcn(T, S, x, w); expected to fill the node
 *                  array x (2*T*S values) and the weight array w (T*S values)
 *  \param T        number of angles
 *  \param S        number of offsets per angle
 *  \param Rf       input samples, read as Rf[t * S + r]
 *  \param NN       bandwidth; the 2D NFFT plan is set up with N = {NN, NN}
 *  \param f        output; receives the real part of each of the
 *                  my_nfft_plan.N_total iterate entries
 *  \param max_i    number of solver steps; for max_i < 1 no step is run and
 *                  the initial search direction p_hat_iter is returned
 *  \return 0 on success, EXIT_FAILURE if an allocation or the 1D FFTW plan
 *          creation fails (all resources acquired so far are released)
 */
static int inverse_radon_trafo(int (*gridfcn)(), int T, int S, NFFT_R *Rf,
    int NN, NFFT_R *f, int max_i)
{
  int j, k;                            /**< index for nodes and frequencies  */
  NFFT(plan) my_nfft_plan;             /**< plan for the nfft-2D             */
  SOLVER(plan_complex) my_infft_plan;  /**< plan for the inverse nfft        */

  NFFT_C *fft = NULL;                  /**< variable for the fftw-1Ds        */
  FFTW(plan) my_fftw_plan;             /**< plan for the fftw-1Ds            */

  int t, r;                            /**< index for directions and offsets */
  NFFT_R *x = NULL, *w = NULL;         /**< knots and associated weights     */
  int l;                               /**< index for iterations             */

  int N[2], n[2];
  int M = T * S;
  int status = EXIT_FAILURE;

  N[0] = NN;
  n[0] = 2 * N[0];
  N[1] = NN;
  n[1] = 2 * N[1];

  fft = (NFFT_C *) NFFT(malloc)((size_t)(S) * sizeof(NFFT_C));
  if (fft == NULL)
    return EXIT_FAILURE;

  my_fftw_plan = FFTW(plan_dft_1d)(S, fft, fft, FFTW_FORWARD, FFTW_MEASURE);
  if (my_fftw_plan == NULL)
    goto cleanup_fft;

  x = (NFFT_R *) NFFT(malloc)((size_t)(2 * T * S) * (sizeof(NFFT_R)));
  if (x == NULL)
    goto cleanup_plan;

  w = (NFFT_R *) NFFT(malloc)((size_t)(T * S) * (sizeof(NFFT_R)));
  if (w == NULL)
    goto cleanup_x;

  /** init two dimensional NFFT plan */
  NFFT(init_guru)(&my_nfft_plan, 2, N, M, n, 4,
      PRE_PHI_HUT | PRE_PSI | MALLOC_X | MALLOC_F_HAT | MALLOC_F | FFTW_INIT
          | FFT_OUT_OF_PLACE,
      FFTW_MEASURE | FFTW_DESTROY_INPUT);

  /** init two dimensional infft plan */
  SOLVER(init_advanced_complex)(&my_infft_plan,
      (NFFT(mv_plan_complex)*) (&my_nfft_plan), CGNR | PRECOMPUTE_WEIGHT);

  /** init nodes and weights of grid */
  gridfcn(T, S, x, w);
  for (j = 0; j < my_nfft_plan.M_total; j++)
  {
    my_nfft_plan.x[2 * j + 0] = x[2 * j + 0];
    my_nfft_plan.x[2 * j + 1] = x[2 * j + 1];
    /* the first offset of every direction gets weight zero */
    if (j % S)
      my_infft_plan.w[j] = w[j];
    else
      my_infft_plan.w[j] = NFFT_K(0.0);
  }

  /** precompute psi, the entries of the matrix B */
  if (my_nfft_plan.flags & PRE_LIN_PSI)
    NFFT(precompute_lin_psi)(&my_nfft_plan);

  if (my_nfft_plan.flags & PRE_PSI)
    NFFT(precompute_psi)(&my_nfft_plan);

  if (my_nfft_plan.flags & PRE_FULL_PSI)
    NFFT(precompute_full_psi)(&my_nfft_plan);

  /** compute 1D-ffts and init given samples and weights */
  for (t = 0; t < T; t++)
  {
    for (r = 0; r < S; r++)
      fft[r] = Rf[t * S + r] + _Complex_I * NFFT_K(0.0);

    NFFT(fftshift_complex_int)(fft, 1, &S);
    FFTW(execute)(my_fftw_plan);
    NFFT(fftshift_complex_int)(fft, 1, &S);

    /* the sample at offset index 0 is set to zero; the remaining ones are
     * the transformed samples divided by KERNEL(r) */
    my_infft_plan.y[t * S] = NFFT_K(0.0);
    for (r = -S / 2 + 1; r < S / 2; r++)
      my_infft_plan.y[t * S + (r + S / 2)] = fft[r + S / 2] / KERNEL(r);
  }

  /** initialise some guess f_hat_0 */
  for (k = 0; k < my_nfft_plan.N_total; k++)
    my_infft_plan.f_hat_iter[k] = NFFT_K(0.0) + _Complex_I * NFFT_K(0.0);

  /** solve the system */
  SOLVER(before_loop_complex)(&my_infft_plan);

  if (max_i < 1)
  {
    for (k = 0; k < my_nfft_plan.N_total; k++)
      my_infft_plan.f_hat_iter[k] = my_infft_plan.p_hat_iter[k];
  }
  else
  {
    /* always runs exactly max_i steps; there is no residual-based stop */
    for (l = 1; l <= max_i; l++)
      SOLVER(loop_one_step_complex)(&my_infft_plan);
  }

  /** copy result */
  for (k = 0; k < my_nfft_plan.N_total; k++)
    f[k] = NFFT_M(creal)(my_infft_plan.f_hat_iter[k]);

  status = 0;

  /** finalise the plans and free the variables */
  SOLVER(finalize_complex)(&my_infft_plan);
  NFFT(finalize)(&my_nfft_plan);
  NFFT(free)(w);
cleanup_x:
  NFFT(free)(x);
cleanup_plan:
  FFTW(destroy_plan)(my_fftw_plan);
cleanup_fft:
  NFFT(free)(fft);
  return status;
}
int main(int argc, char **argv) { int n[3]; pfft_complex *in, *out; FFTW(plan) plan_forw=NULL, plan_back=NULL; double err, time, time_fftw[2], max_time_fftw[2]; unsigned fftw_flag; /* setup default parameters */ int iter = 10, inplace = 0, patience = 0; /* Set size of FFT and process mesh */ n[0] = n[1] = n[2] = 16; /* Initialize MPI and PFFT */ MPI_Init(&argc, &argv); pfft_init(); /* read parameters from command line */ init_parameters(argc, argv, n, &iter, &inplace, &patience); /* setup FFTWs planing depth */ switch(patience){ case 1: fftw_flag = FFTW_MEASURE; break; case 2: fftw_flag = FFTW_PATIENT; break; case 3: fftw_flag = FFTW_EXHAUSTIVE; break; default: fftw_flag = FFTW_ESTIMATE; } if(!inplace) fftw_flag |= FFTW_DESTROY_INPUT; /* Allocate memory */ in = pfft_alloc_complex(n[0]*n[1]*n[2]); out = (inplace) ? in : pfft_alloc_complex(n[0]*n[1]*n[2]); /* We often want to scale large FFTs, which do not fit on few processes. */ if( (in == NULL) || (out == NULL)){ fprintf(stderr, "!!! Error: Not enough memory to allocate input/output arrays !!!\n"); MPI_Finalize(); MPI_Finalize(); return 1; } ptrdiff_t local_ni[3], local_i_start[3], n_ptr[3]; for(int t=0; t<3; t++){ local_i_start[t] = 0; n_ptr[t] = local_ni[t] = (ptrdiff_t) n[t]; } plan_forw = FFTW(plan_dft_3d)(n[0], n[1], n[2], in, out, FFTW_FORWARD, fftw_flag); plan_back = FFTW(plan_dft_3d)(n[0], n[1], n[2], out, in, FFTW_BACKWARD, fftw_flag); /* Initialize input with random numbers */ pfft_init_input_complex_3d(n_ptr, local_ni, local_i_start, in); time_fftw[0] = time_fftw[1] = 0; for(int t=0; t<iter; t++){ /* execute parallel forward FFT */ time_fftw[0] -= MPI_Wtime(); FFTW(execute)(plan_forw); time_fftw[0] += MPI_Wtime(); /* execute parallel backward FFT */ time_fftw[1] -= MPI_Wtime(); FFTW(execute)(plan_back); time_fftw[1] += MPI_Wtime(); } /* Scale data */ for(int t=0; t<iter; t++) for(ptrdiff_t l=0; l < n[0] * n[1] * n[2]; l++) in[l] /= (n[0]*n[1]*n[2]); printf("fftw_forw = %.2e, fftw_back = %.2e\n", 
time_fftw[0]/iter, time_fftw[1]/iter); err = pfft_check_output_complex_3d(n_ptr, local_ni, local_i_start, in, MPI_COMM_WORLD); printf("Error after several forward and backward FFTWs of size n=(%td, %td, %td):\n", n[0], n[1], n[2]); printf("maxerror = %6.2e;\n", err); /* free mem and finalize */ FFTW(destroy_plan)(plan_forw); FFTW(destroy_plan)(plan_back); pfft_free(in); if(!inplace) pfft_free(out); MPI_Finalize(); return 0; }
/*
 * Hook run after real input data has been copied from the problem into ri:
 * passes ri to do_scatter_in() and then, if an input scrambling plan has
 * been set up (plan_scramble_in is non-null), executes that plan.
 * The problem descriptor p is not used.
 */
void after_problem_rcopy_from(bench_problem *p, bench_real *ri)
{
     UNUSED(p);

     do_scatter_in(ri);

     if (!plan_scramble_in)
          return;

     FFTW(execute)(plan_scramble_in);
}