Exemplo n.º 1
0
/*
 * Read previously saved wisdom from the file named by `wisdat`.
 *
 * Does nothing if wisdom has already been read (`havewisdom`).  When built
 * with HAVE_SMP the threading layer is initialised first, whether or not
 * wisdom is requested.  If `usewisdom` is not set the function returns
 * before touching the file, and `havewisdom` is left unchanged.
 *
 * The time spent importing is measured with USER_TIMER and reported when
 * verbose > 1; the imported wisdom is additionally dumped to stdout when
 * verbose > 3.
 */
void rdwisdom(void)
{
     FILE *wisdom_file;
     double elapsed;
     int loaded = 0;

     if (havewisdom)
          return;

#ifdef HAVE_SMP
     BENCH_ASSERT(FFTW(init_threads)());
     FFTW(plan_with_nthreads)(nthreads);
#endif

     if (!usewisdom)
          return;

     timer_start(USER_TIMER);
     wisdom_file = fopen(wisdat, "r");
     if (wisdom_file != NULL) {
          if (import_wisdom(wisdom_file))
               loaded = 1;
          else
               fprintf(stderr, "bench: ERROR reading wisdom\n");
          fclose(wisdom_file);
     }
     elapsed = timer_stop(USER_TIMER);

     /* verbose > 3 implies verbose > 1, so the dump nests inside the report */
     if (loaded && verbose > 1) {
          printf("READ WISDOM (%g seconds): ", elapsed);
          if (verbose > 3)
               export_wisdom(stdout);
          printf("\n");
     }
     havewisdom = 1;
}
Exemplo n.º 2
0
static FFTW(plan) mkplan_real(bench_problem *p, unsigned flags)
{
     FFTW(plan) pln = 0;
     int i; 
     ptrdiff_t ntot;

     vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1;

     if (p->sz->rnk < 2
	 || p->split
	 || !tensor_real_contiguousp(p->sz, p->sign, vn)
	 || tensor_rowmajor_transposedp(p->sz)
	 || p->vecsz->rnk > 1
	 || (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1
				    || p->vecsz->dims[0].os != 1)))
	  return 0;

     alloc_rnk(p->sz->rnk);
     for (i = 0; i < rnk; ++i) {
	  total_ni[i] = total_no[i] = p->sz->dims[i].n;
	  local_ni[i] = local_no[i] = total_ni[i];
	  local_starti[i] = local_starto[i] = 0;
     }
     local_ni[rnk-1] = local_no[rnk-1] = total_ni[rnk-1] = total_no[rnk-1] 
	  = p->sz->dims[rnk-1].n / 2 + 1;
     {
	  ptrdiff_t n, start, nT, startT;
	  ntot = FFTW(mpi_local_size_many_transposed)
	       (p->sz->rnk, total_ni, vn,
		FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
		MPI_COMM_WORLD,
		&n, &start, &nT, &startT);
	  if  (flags & FFTW_MPI_TRANSPOSED_IN) {
	       local_ni[1] = nT;
	       local_starti[1] = startT;
	  }
	  else {
	       local_ni[0] = n;
	       local_starti[0] = start;
	  }
	  if  (flags & FFTW_MPI_TRANSPOSED_OUT) {
	       local_no[1] = nT;
	       local_starto[1] = startT;
	  }
	  else {
	       local_no[0] = n;
	       local_starto[0] = start;
	  }
     }
     alloc_local(ntot * 2, p->in == p->out);

     total_ni[rnk - 1] = p->sz->dims[rnk - 1].n;
     if (p->sign < 0)
	  pln = FFTW(mpi_plan_many_dft_r2c)(p->sz->rnk, total_ni, vn, 
					    FFTW_MPI_DEFAULT_BLOCK,
					    FFTW_MPI_DEFAULT_BLOCK,
					    local_in, 
					    (FFTW(complex) *) local_out,
					    MPI_COMM_WORLD, flags);
     else
Exemplo n.º 3
0
/*
 * Interpret one user option string and update the benchmark settings.
 *
 * Recognised forms:
 *   - planner keywords ("patient", "estimate", "estimatepat", "exhaustive",
 *     "unaligned", "nosimd", "noindirectop", "wisdom-only"), each of which
 *     ORs the corresponding FFTW flag into `the_flags`;
 *   - "flag=N"   ORs the integer N into `the_flags`;
 *   - "bflag=N"  sets bit number N of `the_flags`;
 *   - "paranoid", "wisdom", "amnesia" set the matching global switch;
 *   - "nthreads=N", "nspe=N" (only with HAVE_CELL), "timelimit=X".
 *
 * Anything else is reported on stderr and ignored.
 */
void useropt(const char *arg)
{
     int x;
     double y;

     if (!strcmp(arg, "patient")) the_flags |= FFTW_PATIENT;
     else if (!strcmp(arg, "estimate")) the_flags |= FFTW_ESTIMATE;
     else if (!strcmp(arg, "estimatepat")) the_flags |= FFTW_ESTIMATE_PATIENT;
     else if (!strcmp(arg, "exhaustive")) the_flags |= FFTW_EXHAUSTIVE;
     else if (!strcmp(arg, "unaligned")) the_flags |= FFTW_UNALIGNED;
     else if (!strcmp(arg, "nosimd")) the_flags |= FFTW_NO_SIMD;
     else if (!strcmp(arg, "noindirectop")) the_flags |= FFTW_NO_INDIRECT_OP;
     else if (!strcmp(arg, "wisdom-only")) the_flags |= FFTW_WISDOM_ONLY;
     else if (sscanf(arg, "flag=%d", &x) == 1) the_flags |= x;
     else if (sscanf(arg, "bflag=%d", &x) == 1) {
	  /* Shifting by a negative count, or by at least the width of
	     the operand, is undefined behaviour.  8 * sizeof(unsigned)
	     never exceeds the real bit width, so this bound is safe
	     (merely conservative on platforms with wider bytes). */
	  if (x >= 0 && x < (int) (8 * sizeof(unsigned)))
	       the_flags |= 1U << x;
	  else
	       fprintf(stderr, "bflag bit index %d out of range.  Ignoring.\n",
		       x);
     }
     else if (!strcmp(arg, "paranoid")) paranoid = 1;
     else if (!strcmp(arg, "wisdom")) usewisdom = 1;
     else if (!strcmp(arg, "amnesia")) amnesia = 1;
     else if (sscanf(arg, "nthreads=%d", &x) == 1) nthreads = x;
#if HAVE_CELL
     else if (sscanf(arg, "nspe=%d", &x) == 1) FFTW(cell_set_nspe)(x);
#endif
     else if (sscanf(arg, "timelimit=%lg", &y) == 1) {
	  FFTW(set_timelimit)(y);
     }

     else fprintf(stderr, "unknown user option: %s.  Ignoring.\n", arg);
}
Exemplo n.º 4
0
/*
 * Execute the current plan (`the_plan`) `iter` times in a row.
 * The problem descriptor is not consulted.  A non-positive `iter`
 * executes nothing.
 */
void doit(int iter, bench_problem *p)
{
     FFTW(plan) q = the_plan;
     int remaining = iter;

     UNUSED(p);
     while (remaining > 0) {
          FFTW(execute)(q);
          --remaining;
     }
}
Exemplo n.º 5
0
/*
 * Finish with the current problem: destroy `the_plan` and then remove
 * the hook via uninstall_hook().  The problem descriptor is not used.
 */
void done(bench_problem *p)
{
     FFTW(destroy_plan)(the_plan);
     uninstall_hook();

     UNUSED(p);
}
Exemplo n.º 6
0
/*
 * Read previously saved wisdom from the file named by `wisdat`.
 *
 * Does nothing if wisdom has already been read (`havewisdom`).  When built
 * with HAVE_SMP and `threads_ok` is set, the threading layer is initialised
 * and configured for `nthreads` threads.  When threads are not available
 * but more than one thread was requested, `nthreads` is forced back to 1
 * so that the rest of the program sees the thread count actually in use;
 * a warning is printed only when verbose > 1.
 *
 * If `usewisdom` is not set the function returns before touching the file,
 * and `havewisdom` is left unchanged.  Otherwise the time spent importing
 * is measured with USER_TIMER and reported when verbose > 1; the imported
 * wisdom is additionally dumped to stdout when verbose > 3.
 */
void rdwisdom(void)
{
     FILE *f;
     double tim;
     int success = 0;

     if (havewisdom) return;

#ifdef HAVE_SMP
     if (threads_ok) {
	  BENCH_ASSERT(FFTW(init_threads)());
	  FFTW(plan_with_nthreads)(nthreads);
          FFTW(make_planner_thread_safe)();
#ifdef _OPENMP
	  omp_set_num_threads(nthreads);
#endif
     }
     else if (nthreads > 1) {
	  /* The reset must not depend on the verbosity level; only the
	     diagnostic does. */
	  if (verbose > 1)
	       fprintf(stderr, "bench: WARNING - nthreads = %d, but threads not supported\n", nthreads);
	  nthreads = 1;
     }
#endif

     if (!usewisdom) return;

     timer_start(USER_TIMER);
     if ((f = fopen(wisdat, "r"))) {
	  if (!import_wisdom(f))
	       fprintf(stderr, "bench: ERROR reading wisdom\n");
	  else
	       success = 1;
	  fclose(f);
     }
     tim = timer_stop(USER_TIMER);

     if (success) {
	  if (verbose > 1) printf("READ WISDOM (%g seconds): ", tim);

	  if (verbose > 3)
	       export_wisdom(stdout);
	  if (verbose > 1)
	       printf("\n");
     }
     havewisdom = 1;
}
Exemplo n.º 7
0
/*
 * Release a maxwell_data structure and what it owns: the forward and
 * inverse FFT plans (d->plans / d->iplans, d->nplans of each), the
 * eps_inv, k_plus_G and k_plus_G_normsqr arrays, the FFT work buffer(s),
 * and finally the structure itself.
 *
 * Which plan destructor and which deallocator are used depends on the
 * FFTW version and on the MPI / SCALAR_COMPLEX build options.
 * A null pointer is accepted and ignored.
 */
void destroy_maxwell_data(maxwell_data *d)
{
     int plan_idx;

     if (!d)
          return;

     for (plan_idx = 0; plan_idx < d->nplans; ++plan_idx) {
#if defined(HAVE_FFTW3)
          FFTW(destroy_plan)((fftplan) (d->plans[plan_idx]));
          FFTW(destroy_plan)((fftplan) (d->iplans[plan_idx]));
#elif defined(HAVE_FFTW)
#  ifdef HAVE_MPI
#    ifdef SCALAR_COMPLEX
          fftwnd_mpi_destroy_plan((fftplan) (d->plans[plan_idx]));
          fftwnd_mpi_destroy_plan((fftplan) (d->iplans[plan_idx]));
#    else /* not SCALAR_COMPLEX */
          rfftwnd_mpi_destroy_plan((fftplan) (d->plans[plan_idx]));
          rfftwnd_mpi_destroy_plan((fftplan) (d->iplans[plan_idx]));
#    endif /* not SCALAR_COMPLEX */
#  else /* not HAVE_MPI */
#    ifdef SCALAR_COMPLEX
          fftwnd_destroy_plan((fftplan) (d->plans[plan_idx]));
          fftwnd_destroy_plan((fftplan) (d->iplans[plan_idx]));
#    else /* not SCALAR_COMPLEX */
          rfftwnd_destroy_plan((fftplan) (d->plans[plan_idx]));
          rfftwnd_destroy_plan((fftplan) (d->iplans[plan_idx]));
#    endif /* not SCALAR_COMPLEX */
#  endif /* not HAVE_MPI */
#endif /* HAVE FFTW */
     }

     free(d->eps_inv);

#if defined(HAVE_FFTW3)
     FFTW(free)(d->fft_data);
     /* fft_data2 may alias fft_data; release it only when it is distinct */
     if (d->fft_data2 != d->fft_data)
          FFTW(free)(d->fft_data2);
#else
     free(d->fft_data);
#endif

     free(d->k_plus_G);
     free(d->k_plus_G_normsqr);

     free(d);
}
Exemplo n.º 8
0
/*
 * Global shutdown sequence: run initial_cleanup(), write wisdom back out
 * with wrwisdom(), call the library's cleanup routine (the threaded
 * variant when built with HAVE_SMP), optionally print allocator
 * statistics, and finish with final_cleanup().
 */
void cleanup(void)
{
     initial_cleanup();
     wrwisdom();

#ifndef HAVE_SMP
     FFTW(cleanup)();
#else
     FFTW(cleanup_threads)();
#endif

#ifdef FFTW_DEBUG_MALLOC
     {
          /* undocumented memory checker; declared locally */
          FFTW_EXTERN void FFTW(malloc_print_minfo)(int v);
          FFTW(malloc_print_minfo)(verbose);
     }
#endif

     final_cleanup();
}
Exemplo n.º 9
0
/*
 * Build a plan that locally transposes an nx-by-ny array whose elements
 * are contiguous groups of vn reals, reading from `in` and writing to
 * `out`.
 *
 * The plan is created through the 64-bit guru r2r interface with the
 * first two arguments 0 and three dimension descriptors: input strides
 * (ny*vn, vn, 1) and output strides (vn, nx*vn, 1).  Planning uses
 * FFTW_ESTIMATE.  A null plan trips BENCH_ASSERT.
 */
static FFTW(plan) mkplan_transpose_local(ptrdiff_t nx, ptrdiff_t ny,
                                         ptrdiff_t vn,
                                         bench_real *in, bench_real *out)
{
     enum { NDIMS = 3 };
     FFTW(iodim64) hdims[NDIMS];
     FFTW(r2r_kind) k[NDIMS];
     FFTW(plan) pln;
     int j;

     /* extents */
     hdims[0].n = nx;
     hdims[1].n = ny;
     hdims[2].n = vn;

     /* input strides */
     hdims[0].is = ny * vn;
     hdims[1].is = vn;
     hdims[2].is = 1;

     /* output strides: the first two dimensions swap roles */
     hdims[0].os = vn;
     hdims[1].os = nx * vn;
     hdims[2].os = 1;

     for (j = 0; j < NDIMS; ++j)
          k[j] = FFTW_R2HC;

     pln = FFTW(plan_guru64_r2r)(0, 0, NDIMS, hdims, in, out, k,
                                 FFTW_ESTIMATE);
     BENCH_ASSERT(pln != 0);
     return pln;
}
Exemplo n.º 10
0
/*
 * Prepare to benchmark problem `p`: install the SIGFPE handler, optionally
 * forget wisdom (`amnesia`), read wisdom, install the hook, and create
 * `the_plan` with the user-selected flags.  Planning time is measured with
 * USER_TIMER.  A null plan trips BENCH_ASSERT.
 *
 * When verbose > 1 the thread count (HAVE_SMP builds, nthreads > 1), the
 * planner time, the plan itself, its operation counts and its cost
 * figures are printed to stdout.
 */
void setup(bench_problem *p)
{
     double plan_secs;
     double n_add, n_mul, n_fma, est_cost, plan_cost;
     void *probe;

     setup_sigfpe_handler();

     if (amnesia) {
          FFTW(forget_wisdom)();
          havewisdom = 0;
     }

     /* Regression test: check that fftw_malloc exists and links
      * properly.  (An alignment assertion on the returned pointer,
      * BENCH_ASSERT(FFTW(alignment_of)(ptr) == 0), is disabled.) */
     probe = FFTW(malloc(42));
     FFTW(free(probe));

     rdwisdom();
     install_hook();

#ifdef HAVE_SMP
     if (verbose > 1 && nthreads > 1)
          printf("NTHREADS = %d\n", nthreads);
#endif

     timer_start(USER_TIMER);
     the_plan = mkplan(p, preserve_input_flags(p) | the_flags);
     plan_secs = timer_stop(USER_TIMER);
     if (verbose > 1)
          printf("planner time: %g s\n", plan_secs);

     BENCH_ASSERT(the_plan);

     /* the statistics are always queried, but only printed when verbose */
     FFTW(flops)(the_plan, &n_add, &n_mul, &n_fma);
     est_cost = FFTW(estimate_cost)(the_plan);
     plan_cost = FFTW(cost)(the_plan);
     if (verbose <= 1)
          return;

     FFTW(print_plan)(the_plan);
     printf("\n");
     printf("flops: %0.0f add, %0.0f mul, %0.0f fma\n",
            n_add, n_mul, n_fma);
     printf("estimated cost: %f, pcost = %f\n", est_cost, plan_cost);
}
Exemplo n.º 11
0
/*
 * Report whether problem `p` can be planned.
 *
 * Reads wisdom if necessary, then tries to create a plan with the user
 * flags plus FFTW_ESTIMATE.  The trial plan is stored in `the_plan` and,
 * when it was created, destroyed again before returning.  Progress and
 * planning time are printed when verbose > 2.
 *
 * Returns 1 if a plan could be created, 0 otherwise.
 */
int can_do(bench_problem *p)
{
     double plan_secs;

     if (verbose > 2 && p->pstring)
          printf("Planning %s...\n", p->pstring);
     rdwisdom();

     timer_start(USER_TIMER);
     the_plan = mkplan(p, preserve_input_flags(p) | the_flags | FFTW_ESTIMATE);
     plan_secs = timer_stop(USER_TIMER);
     if (verbose > 2)
          printf("estimate-planner time: %g s\n", plan_secs);

     if (!the_plan)
          return 0;

     FFTW(destroy_plan)(the_plan);
     return 1;
}
Exemplo n.º 12
0
/* Return the library's `version` string. */
static const char *mkversion(void)
{
     return FFTW(version);
}
Exemplo n.º 13
0
/*
 * Post-processing for a real copy-out: if an unscramble plan
 * (`plan_unscramble_out`) has been set up, execute it first, then hand
 * `ro` to do_gather_out().  The problem descriptor is not used.
 */
void after_problem_rcopy_to(bench_problem *p, bench_real *ro)
{
     UNUSED(p);

     if (plan_unscramble_out != 0) {
          FFTW(execute)(plan_unscramble_out);
     }
     do_gather_out(ro);
}
Exemplo n.º 14
0
/* NxMxK is the size of the data.
 * comm is the communicator to use for fft5d.
 * P0 is the number of processors along the 1st axis (can be null for automatic).
 * lin is allocated by fft5d because the size of the array is only known after the planning phase.
 * rlout2 is only used as an intermediate buffer - it is only returned after allocation so that it
 * can be reused for the back transform - and should not be used by the caller.
 */
fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
{

    int        P[2], bMaster, prank[2], i, t;
    int        rNG, rMG, rKG;
    int       *N0 = 0, *N1 = 0, *M0 = 0, *M1 = 0, *K0 = 0, *K1 = 0, *oN0 = 0, *oN1 = 0, *oM0 = 0, *oM1 = 0, *oK0 = 0, *oK1 = 0;
    int        N[3], M[3], K[3], pN[3], pM[3], pK[3], oM[3], oK[3], *iNin[3] = {0}, *oNin[3] = {0}, *iNout[3] = {0}, *oNout[3] = {0};
    int        C[3], rC[3], nP[2];
    int        lsize;
    t_complex *lin = 0, *lout = 0, *lout2 = 0, *lout3 = 0;
    fft5d_plan plan;
    int        s;

    /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
#ifdef GMX_MPI
    if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
    {
        MPI_Comm_size(comm[0], &P[0]);
        MPI_Comm_rank(comm[0], &prank[0]);
    }
    else
#endif
    {
        P[0]     = 1;
        prank[0] = 0;
    }
#ifdef GMX_MPI
    if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
    {
        MPI_Comm_size(comm[1], &P[1]);
        MPI_Comm_rank(comm[1], &prank[1]);
    }
    else
#endif
    {
        P[1]     = 1;
        prank[1] = 0;
    }

    bMaster = (prank[0] == 0 && prank[1] == 0);


    if (debug)
    {
        fprintf(debug, "FFT5D: Using %dx%d processor grid, rank %d,%d\n",
                P[0], P[1], prank[0], prank[1]);
    }

    if (bMaster)
    {
        if (debug)
        {
            fprintf(debug, "FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
                    NG, MG, KG, P[0], P[1], (flags&FFT5D_REALCOMPLEX) > 0, (flags&FFT5D_BACKWARD) > 0, (flags&FFT5D_ORDER_YZ) > 0, (flags&FFT5D_DEBUG) > 0);
        }
        /* The check below is not correct, one prime factor 11 or 13 is ok.
           if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
            printf("WARNING: FFT very slow with prime factors larger 7\n");
            printf("Change FFT size or in case you cannot change it look at\n");
            printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
           }
         */
    }

    if (NG == 0 || MG == 0 || KG == 0)
    {
        if (bMaster)
        {
            printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
        }
        return 0;
    }

    rNG = NG; rMG = MG; rKG = KG;

    if (flags&FFT5D_REALCOMPLEX)
    {
        if (!(flags&FFT5D_BACKWARD))
        {
            NG = NG/2+1;
        }
        else
        {
            if (!(flags&FFT5D_ORDER_YZ))
            {
                MG = MG/2+1;
            }
            else
            {
                KG = KG/2+1;
            }
        }
    }


    /*for transpose we need to know the size for each processor not only our own size*/

    N0  = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
    M0  = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
    K0  = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
    oN0 = (int*)malloc(P[0]*sizeof(int)); oN1 = (int*)malloc(P[1]*sizeof(int));
    oM0 = (int*)malloc(P[0]*sizeof(int)); oM1 = (int*)malloc(P[1]*sizeof(int));
    oK0 = (int*)malloc(P[0]*sizeof(int)); oK1 = (int*)malloc(P[1]*sizeof(int));

    for (i = 0; i < P[0]; i++)
    {
        #define EVENDIST
        #ifndef EVENDIST
        oN0[i] = i*ceil((double)NG/P[0]);
        oM0[i] = i*ceil((double)MG/P[0]);
        oK0[i] = i*ceil((double)KG/P[0]);
        #else
        oN0[i] = (NG*i)/P[0];
        oM0[i] = (MG*i)/P[0];
        oK0[i] = (KG*i)/P[0];
        #endif
    }
    for (i = 0; i < P[1]; i++)
    {
        #ifndef EVENDIST
        oN1[i] = i*ceil((double)NG/P[1]);
        oM1[i] = i*ceil((double)MG/P[1]);
        oK1[i] = i*ceil((double)KG/P[1]);
        #else
        oN1[i] = (NG*i)/P[1];
        oM1[i] = (MG*i)/P[1];
        oK1[i] = (KG*i)/P[1];
        #endif
    }
    for (i = 0; i < P[0]-1; i++)
    {
        N0[i] = oN0[i+1]-oN0[i];
        M0[i] = oM0[i+1]-oM0[i];
        K0[i] = oK0[i+1]-oK0[i];
    }
    N0[P[0]-1] = NG-oN0[P[0]-1];
    M0[P[0]-1] = MG-oM0[P[0]-1];
    K0[P[0]-1] = KG-oK0[P[0]-1];
    for (i = 0; i < P[1]-1; i++)
    {
        N1[i] = oN1[i+1]-oN1[i];
        M1[i] = oM1[i+1]-oM1[i];
        K1[i] = oK1[i+1]-oK1[i];
    }
    N1[P[1]-1] = NG-oN1[P[1]-1];
    M1[P[1]-1] = MG-oM1[P[1]-1];
    K1[P[1]-1] = KG-oK1[P[1]-1];

    /* for step 1-3 the local N,M,K sizes of the transposed system
       C: contiguous dimension, and nP: number of processor in subcommunicator
       for that step */


    pM[0] = M0[prank[0]];
    oM[0] = oM0[prank[0]];
    pK[0] = K1[prank[1]];
    oK[0] = oK1[prank[1]];
    C[0]  = NG;
    rC[0] = rNG;
    if (!(flags&FFT5D_ORDER_YZ))
    {
        N[0]     = vmax(N1, P[1]);
        M[0]     = M0[prank[0]];
        K[0]     = vmax(K1, P[1]);
        pN[0]    = N1[prank[1]];
        iNout[0] = N1;
        oNout[0] = oN1;
        nP[0]    = P[1];
        C[1]     = KG;
        rC[1]    = rKG;
        N[1]     = vmax(K0, P[0]);
        pN[1]    = K0[prank[0]];
        iNin[1]  = K1;
        oNin[1]  = oK1;
        iNout[1] = K0;
        oNout[1] = oK0;
        M[1]     = vmax(M0, P[0]);
        pM[1]    = M0[prank[0]];
        oM[1]    = oM0[prank[0]];
        K[1]     = N1[prank[1]];
        pK[1]    = N1[prank[1]];
        oK[1]    = oN1[prank[1]];
        nP[1]    = P[0];
        C[2]     = MG;
        rC[2]    = rMG;
        iNin[2]  = M0;
        oNin[2]  = oM0;
        M[2]     = vmax(K0, P[0]);
        pM[2]    = K0[prank[0]];
        oM[2]    = oK0[prank[0]];
        K[2]     = vmax(N1, P[1]);
        pK[2]    = N1[prank[1]];
        oK[2]    = oN1[prank[1]];
        free(N0); free(oN0); /*these are not used for this order*/
        free(M1); free(oM1); /*the rest is freed in destroy*/
    }
    else
    {
        N[0]     = vmax(N0, P[0]);
        M[0]     = vmax(M0, P[0]);
        K[0]     = K1[prank[1]];
        pN[0]    = N0[prank[0]];
        iNout[0] = N0;
        oNout[0] = oN0;
        nP[0]    = P[0];
        C[1]     = MG;
        rC[1]    = rMG;
        N[1]     = vmax(M1, P[1]);
        pN[1]    = M1[prank[1]];
        iNin[1]  = M0;
        oNin[1]  = oM0;
        iNout[1] = M1;
        oNout[1] = oM1;
        M[1]     = N0[prank[0]];
        pM[1]    = N0[prank[0]];
        oM[1]    = oN0[prank[0]];
        K[1]     = vmax(K1, P[1]);
        pK[1]    = K1[prank[1]];
        oK[1]    = oK1[prank[1]];
        nP[1]    = P[1];
        C[2]     = KG;
        rC[2]    = rKG;
        iNin[2]  = K1;
        oNin[2]  = oK1;
        M[2]     = vmax(N0, P[0]);
        pM[2]    = N0[prank[0]];
        oM[2]    = oN0[prank[0]];
        K[2]     = vmax(M1, P[1]);
        pK[2]    = M1[prank[1]];
        oK[2]    = oM1[prank[1]];
        free(N1); free(oN1); /*these are not used for this order*/
        free(K0); free(oK0); /*the rest is freed in destroy*/
    }
    N[2] = pN[2] = -1;       /*not used*/

    /*
       Difference between x-y-z regarding 2d decomposition is whether they are
       distributed along axis 1, 2 or both
     */

    /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
    lsize = std::max(N[0]*M[0]*K[0]*nP[0], std::max(N[1]*M[1]*K[1]*nP[1], C[2]*M[2]*K[2]));
    /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
    if (!(flags&FFT5D_NOMALLOC))
    {
        snew_aligned(lin, lsize, 32);
        snew_aligned(lout, lsize, 32);
        if (nthreads > 1)
        {
            /* We need extra transpose buffers to avoid OpenMP barriers */
            snew_aligned(lout2, lsize, 32);
            snew_aligned(lout3, lsize, 32);
        }
        else
        {
            /* We can reuse the buffers to avoid cache misses */
            lout2 = lin;
            lout3 = lout;
        }
    }
    else
    {
        lin  = *rlin;
        lout = *rlout;
        if (nthreads > 1)
        {
            lout2 = *rlout2;
            lout3 = *rlout3;
        }
        else
        {
            lout2 = lin;
            lout3 = lout;
        }
    }

    plan = (fft5d_plan)calloc(1, sizeof(struct fft5d_plan_t));


    if (debug)
    {
        fprintf(debug, "Running on %d threads\n", nthreads);
    }

#ifdef GMX_FFT_FFTW3                                                            /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
    /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
     * within a parallel region. For now deactivated. If it should be supported it has to made sure that
     * that the execute of the 3d plan is in a master/serial block (since it contains it own parallel region)
     * and that the 3d plan is faster than the 1d plan.
     */
    if ((!(flags&FFT5D_INPLACE)) && (!(P[0] > 1 || P[1] > 1)) && nthreads == 1) /*don't do 3d plan in parallel or if in_place requested */
    {
        int fftwflags = FFTW_DESTROY_INPUT;
        FFTW(iodim) dims[3];
        int inNG = NG, outMG = MG, outKG = KG;

        FFTW_LOCK;
        if (!(flags&FFT5D_NOMEASURE))
        {
            fftwflags |= FFTW_MEASURE;
        }
        if (flags&FFT5D_REALCOMPLEX)
        {
            if (!(flags&FFT5D_BACKWARD))        /*input pointer is not complex*/
            {
                inNG *= 2;
            }
            else                                /*output pointer is not complex*/
            {
                if (!(flags&FFT5D_ORDER_YZ))
                {
                    outMG *= 2;
                }
                else
                {
                    outKG *= 2;
                }
            }
        }

        if (!(flags&FFT5D_BACKWARD))
        {
            dims[0].n  = KG;
            dims[1].n  = MG;
            dims[2].n  = rNG;

            dims[0].is = inNG*MG;         /*N M K*/
            dims[1].is = inNG;
            dims[2].is = 1;
            if (!(flags&FFT5D_ORDER_YZ))
            {
                dims[0].os = MG;           /*M K N*/
                dims[1].os = 1;
                dims[2].os = MG*KG;
            }
            else
            {
                dims[0].os = 1;           /*K N M*/
                dims[1].os = KG*NG;
                dims[2].os = KG;
            }
        }
        else
        {
            if (!(flags&FFT5D_ORDER_YZ))
            {
                dims[0].n  = NG;
                dims[1].n  = KG;
                dims[2].n  = rMG;

                dims[0].is = 1;
                dims[1].is = NG*MG;
                dims[2].is = NG;

                dims[0].os = outMG*KG;
                dims[1].os = outMG;
                dims[2].os = 1;
            }
            else
            {
                dims[0].n  = MG;
                dims[1].n  = NG;
                dims[2].n  = rKG;

                dims[0].is = NG;
                dims[1].is = 1;
                dims[2].is = NG*MG;

                dims[0].os = outKG*NG;
                dims[1].os = outKG;
                dims[2].os = 1;
            }
        }
#ifdef FFT5D_THREADS
#ifdef FFT5D_FFTW_THREADS
        FFTW(plan_with_nthreads)(nthreads);
#endif
#endif
        if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD))
        {
            plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
                                                         /*howmany*/ 0, /*howmany_dims*/ 0,
                                                         (real*)lin, (FFTW(complex) *) lout,
                                                         /*flags*/ fftwflags);
        }
Exemplo n.º 15
0
int main(int argc, char **argv)
{
  int n; /**< expansion degree        */
  int m; /**< cut-off parameter       */
  int p; /**< degree of smoothness    */
  char *s; /**< name of kernel          */
  C (*kernel)(R, int, const R *); /**< kernel function         */
  R c; /**< parameter for kernel    */
  R eps_I; /**< inner boundary          */
  R eps_B; /**< outer boundary          */

#ifdef _OPENMP
  int nthreads;

  if (argc != 9)
    return EXIT_FAILURE;

  nthreads = atoi(argv[8]);
  FFTW(init_threads)();
  omp_set_num_threads(nthreads);
#else
  if (argc != 8)
    return EXIT_FAILURE;
#endif

  n = atoi(argv[1]);
  m = atoi(argv[2]);
  p = atoi(argv[3]);
  s = argv[4];
  c = atof(argv[5]);
  eps_I = (R)(atof(argv[6]));
  eps_B = (R)(atof(argv[7]));
  if (strcmp(s, "gaussian") == 0)
    kernel = gaussian;
  else if (strcmp(s, "multiquadric") == 0)
    kernel = multiquadric;
  else if (strcmp(s, "inverse_multiquadric") == 0)
    kernel = inverse_multiquadric;
  else if (strcmp(s, "logarithm") == 0)
    kernel = logarithm;
  else if (strcmp(s, "thinplate_spline") == 0)
    kernel = thinplate_spline;
  else if (strcmp(s, "one_over_square") == 0)
    kernel = one_over_square;
  else if (strcmp(s, "one_over_modulus") == 0)
    kernel = one_over_modulus;
  else if (strcmp(s, "one_over_x") == 0)
    kernel = one_over_x;
  else if (strcmp(s, "inverse_multiquadric3") == 0)
    kernel = inverse_multiquadric3;
  else if (strcmp(s, "sinc_kernel") == 0)
    kernel = sinc_kernel;
  else if (strcmp(s, "cosc") == 0)
    kernel = cosc;
  else if (strcmp(s, "cot") == 0)
    kernel = kcot;
  else
  {
    s = "multiquadric";
    kernel = multiquadric;
  }

  bench_openmp(stdin, n, m, p, kernel, c, eps_I, eps_B);

  return EXIT_SUCCESS;
}
Exemplo n.º 16
0
/* Return the library's `cc` string. */
static const char *mkcc(void)
{
     return FFTW(cc);
}
Exemplo n.º 17
0
/* Return the library's `codelet_optim` string. */
static const char *mkcodelet_optim(void)
{
     return FFTW(codelet_optim);
}
Exemplo n.º 18
0
/*
 * Create a distributed complex DFT plan for problem `p`.
 *
 * Returns 0 without planning when the problem has rank < 1, uses split
 * format, is not contiguous, is row-major transposed, or has a vector
 * dimension other than a single unit-stride one.
 *
 * Otherwise the file-scope bookkeeping arrays (total_*, local_*) are
 * filled in from the problem size and from the local-size query, the
 * local buffers are allocated, and the plan is created on MPI_COMM_WORLD.
 * For rank > 1 with FFTW_MPI_TRANSPOSED_IN / _OUT in `flags`, local
 * transpose plans are also stored in plan_scramble_in /
 * plan_unscramble_out.
 *
 * Side effect: the file-scope `vn` is set from the vector size and then
 * doubled before the transpose plans are built (presumably to count reals
 * rather than complex numbers -- confirm against the callers).
 */
static FFTW(plan) mkplan_complex(bench_problem *p, unsigned flags)
{
     FFTW(plan) pln = 0;
     int i;
     ptrdiff_t ntot;

     if (p->vecsz->rnk == 1)
          vn = p->vecsz->dims[0].n;
     else
          vn = 1;

     /* reject layouts this planner does not handle, in the original order */
     if (p->sz->rnk < 1)
          return 0;
     if (p->split)
          return 0;
     if (!tensor_contiguousp(p->sz, vn))
          return 0;
     if (tensor_rowmajor_transposedp(p->sz))
          return 0;
     if (p->vecsz->rnk > 1)
          return 0;
     if (p->vecsz->rnk == 1
         && (p->vecsz->dims[0].is != 1 || p->vecsz->dims[0].os != 1))
          return 0;

     alloc_rnk(p->sz->rnk);
     for (i = 0; i < rnk; ++i) {
          total_no[i] = p->sz->dims[i].n;
          total_ni[i] = total_no[i];
          local_no[i] = total_ni[i];
          local_ni[i] = local_no[i];
          local_starto[i] = 0;
          local_starti[i] = local_starto[i];
     }

     if (rnk > 1) {
          ptrdiff_t blk, blk_start, blk_t, blk_t_start;

          ntot = FFTW(mpi_local_size_many_transposed)
               (p->sz->rnk, total_ni, vn,
                FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
                MPI_COMM_WORLD,
                &blk, &blk_start, &blk_t, &blk_t_start);

          /* transposed data is split along dimension 1, otherwise along 0 */
          if (flags & FFTW_MPI_TRANSPOSED_IN) {
               local_ni[1] = blk_t;
               local_starti[1] = blk_t_start;
          }
          else {
               local_ni[0] = blk;
               local_starti[0] = blk_start;
          }

          if (flags & FFTW_MPI_TRANSPOSED_OUT) {
               local_no[1] = blk_t;
               local_starto[1] = blk_t_start;
          }
          else {
               local_no[0] = blk;
               local_starto[0] = blk_start;
          }
     }
     else if (rnk == 1) {
          ntot = FFTW(mpi_local_size_many_1d)
               (total_ni[0], vn, MPI_COMM_WORLD, p->sign, flags,
                local_ni, local_starti, local_no, local_starto);
     }
     alloc_local(ntot * 2, p->in == p->out);

     pln = FFTW(mpi_plan_many_dft)(p->sz->rnk, total_ni, vn,
                                   FFTW_MPI_DEFAULT_BLOCK,
                                   FFTW_MPI_DEFAULT_BLOCK,
                                   (FFTW(complex) *) local_in,
                                   (FFTW(complex) *) local_out,
                                   MPI_COMM_WORLD, p->sign, flags);

     vn *= 2;

     if (rnk > 1) {
          /* product of the sizes of all dimensions past the first two */
          ptrdiff_t nrest = 1;

          for (i = 2; i < rnk; ++i)
               nrest *= p->sz->dims[i].n;

          if (flags & FFTW_MPI_TRANSPOSED_IN) {
               plan_scramble_in = mkplan_transpose_local(
                    p->sz->dims[0].n, local_ni[1], vn * nrest,
                    local_in, local_in);
          }
          if (flags & FFTW_MPI_TRANSPOSED_OUT) {
               plan_unscramble_out = mkplan_transpose_local(
                    local_no[1], p->sz->dims[0].n, vn * nrest,
                    local_out, local_out);
          }
     }

     return pln;
}
Exemplo n.º 19
0
int main(int argc, char **argv)
{
  int j, k; /**< indices                 */
  int d; /**< number of dimensions    */
  int N; /**< number of source nodes  */
  int M; /**< number of target nodes  */
  int n; /**< expansion degree        */
  int m; /**< cut-off parameter       */
  int p; /**< degree of smoothness    */
  const char *s; /**< name of kernel          */
  C (*kernel)(R, int, const R *); /**< kernel function         */
  R c; /**< parameter for kernel    */
  fastsum_plan my_fastsum_plan; /**< plan for fast summation */
  C *direct; /**< array for direct computation */
  ticks t0, t1; /**< for time measurement    */
  R time; /**< for time measurement    */
  R error = K(0.0); /**< for error computation   */
  R eps_I; /**< inner boundary          */
  R eps_B; /**< outer boundary          */

  if (argc != 11)
  {
    printf("\nfastsum_test d N M n m p kernel c eps_I eps_B\n\n");
    printf("  d       dimension                 \n");
    printf("  N       number of source nodes    \n");
    printf("  M       number of target nodes    \n");
    printf("  n       expansion degree          \n");
    printf("  m       cut-off parameter         \n");
    printf("  p       degree of smoothness      \n");
    printf("  kernel  kernel function  (e.g., gaussian)\n");
    printf("  c       kernel parameter          \n");
    printf("  eps_I   inner boundary            \n");
    printf("  eps_B   outer boundary            \n\n");
    exit(EXIT_FAILURE);
  }
  else
  {
    d = atoi(argv[1]);
    N = atoi(argv[2]);
    c = K(1.0) / POW((R)(N), K(1.0) / ((R)(d)));
    M = atoi(argv[3]);
    n = atoi(argv[4]);
    m = atoi(argv[5]);
    p = atoi(argv[6]);
    s = argv[7];
    c = (R)(atof(argv[8]));
    eps_I = (R)(atof(argv[9]));
    eps_B = (R)(atof(argv[10]));
    if (strcmp(s, "gaussian") == 0)
      kernel = gaussian;
    else if (strcmp(s, "multiquadric") == 0)
      kernel = multiquadric;
    else if (strcmp(s, "inverse_multiquadric") == 0)
      kernel = inverse_multiquadric;
    else if (strcmp(s, "logarithm") == 0)
      kernel = logarithm;
    else if (strcmp(s, "thinplate_spline") == 0)
      kernel = thinplate_spline;
    else if (strcmp(s, "one_over_square") == 0)
      kernel = one_over_square;
    else if (strcmp(s, "one_over_modulus") == 0)
      kernel = one_over_modulus;
    else if (strcmp(s, "one_over_x") == 0)
      kernel = one_over_x;
    else if (strcmp(s, "inverse_multiquadric3") == 0)
      kernel = inverse_multiquadric3;
    else if (strcmp(s, "sinc_kernel") == 0)
      kernel = sinc_kernel;
    else if (strcmp(s, "cosc") == 0)
      kernel = cosc;
    else if (strcmp(s, "cot") == 0)
      kernel = kcot;
    else
    {
      s = "multiquadric";
      kernel = multiquadric;
    }
  }
  printf(
      "d=%d, N=%d, M=%d, n=%d, m=%d, p=%d, kernel=%s, c=%" __FGS__ ", eps_I=%" __FGS__ ", eps_B=%" __FGS__ " \n",
      d, N, M, n, m, p, s, c, eps_I, eps_B);
#ifdef NF_KUB
  printf("nearfield correction using piecewise cubic Lagrange interpolation\n");
#elif defined(NF_QUADR)
  printf("nearfield correction using piecewise quadratic Lagrange interpolation\n");
#elif defined(NF_LIN)
  printf("nearfield correction using piecewise linear Lagrange interpolation\n");
#endif

#ifdef _OPENMP
#pragma omp parallel
  {
#pragma omp single
    {
      printf("nthreads=%d\n", omp_get_max_threads());
    }
  }

  FFTW(init_threads)();
#endif

  /** init d-dimensional fastsum plan */
  fastsum_init_guru(&my_fastsum_plan, d, N, M, kernel, &c, 0, n, m, p, eps_I,
      eps_B);
  //fastsum_init_guru(&my_fastsum_plan, d, N, M, kernel, &c, NEARFIELD_BOXES, n, m, p, eps_I, eps_B);

  if (my_fastsum_plan.flags & NEARFIELD_BOXES)
    printf(
        "determination of nearfield candidates based on partitioning into boxes\n");
  else
    printf("determination of nearfield candidates based on search tree\n");

  /** init source knots in a d-ball with radius 0.25-eps_b/2 */
  k = 0;
  while (k < N)
  {
    R r_max = K(0.25) - my_fastsum_plan.eps_B / K(2.0);
    R r2 = K(0.0);

    for (j = 0; j < d; j++)
      my_fastsum_plan.x[k * d + j] = K(2.0) * r_max * NFFT(drand48)() - r_max;

    for (j = 0; j < d; j++)
      r2 += my_fastsum_plan.x[k * d + j] * my_fastsum_plan.x[k * d + j];

    if (r2 >= r_max * r_max)
      continue;

    k++;
  }

  for (k = 0; k < N; k++)
  {
    /*    R r=(0.25-my_fastsum_plan.eps_B/2.0)*pow((R)rand()/(R)RAND_MAX,1.0/d);
     my_fastsum_plan.x[k*d+0] = r;
     for (j=1; j<d; j++)
     {
     R phi=2.0*KPI*(R)rand()/(R)RAND_MAX;
     my_fastsum_plan.x[k*d+j] = r;
     for (t=0; t<j; t++)
     {
     my_fastsum_plan.x[k*d+t] *= cos(phi);
     }
     my_fastsum_plan.x[k*d+j] *= sin(phi);
     }
     */
    my_fastsum_plan.alpha[k] = NFFT(drand48)() + II * NFFT(drand48)();
  }

  /** init target knots in a d-ball with radius 0.25-eps_b/2 */
  k = 0;
  while (k < M)
  {
    R r_max = K(0.25) - my_fastsum_plan.eps_B / K(2.0);
    R r2 = K(0.0);

    for (j = 0; j < d; j++)
      my_fastsum_plan.y[k * d + j] = K(2.0) * r_max * NFFT(drand48)() - r_max;

    for (j = 0; j < d; j++)
      r2 += my_fastsum_plan.y[k * d + j] * my_fastsum_plan.y[k * d + j];

    if (r2 >= r_max * r_max)
      continue;

    k++;
  }
  /*  for (k=0; k<M; k++)
   {
   R r=(0.25-my_fastsum_plan.eps_B/2.0)*pow((R)rand()/(R)RAND_MAX,1.0/d);
   my_fastsum_plan.y[k*d+0] = r;
   for (j=1; j<d; j++)
   {
   R phi=2.0*KPI*(R)rand()/(R)RAND_MAX;
   my_fastsum_plan.y[k*d+j] = r;
   for (t=0; t<j; t++)
   {
   my_fastsum_plan.y[k*d+t] *= cos(phi);
   }
   my_fastsum_plan.y[k*d+j] *= sin(phi);
   }
   } */

  /** direct computation */
  printf("direct computation: ");
  fflush(NULL);
  t0 = getticks();
  fastsum_exact(&my_fastsum_plan);
  t1 = getticks();
  time = NFFT(elapsed_seconds)(t1, t0);
  printf(__FI__ "sec\n", time);

  /** copy result */
  direct = (C *) NFFT(malloc)((size_t)(my_fastsum_plan.M_total) * (sizeof(C)));
  for (j = 0; j < my_fastsum_plan.M_total; j++)
    direct[j] = my_fastsum_plan.f[j];

  /** precomputation */
  printf("pre-computation:    ");
  fflush(NULL);
  t0 = getticks();
  fastsum_precompute(&my_fastsum_plan);
  t1 = getticks();
  time = NFFT(elapsed_seconds)(t1, t0);
  printf(__FI__ "sec\n", time);

  /** fast computation */
  printf("fast computation:   ");
  fflush(NULL);
  t0 = getticks();
  fastsum_trafo(&my_fastsum_plan);
  t1 = getticks();
  time = NFFT(elapsed_seconds)(t1, t0);
  printf(__FI__ "sec\n", time);

  /** compute max error */
  error = K(0.0);
  for (j = 0; j < my_fastsum_plan.M_total; j++)
  {
    if (CABS(direct[j] - my_fastsum_plan.f[j]) / CABS(direct[j]) > error)
      error = CABS(direct[j] - my_fastsum_plan.f[j]) / CABS(direct[j]);
  }
  printf("max relative error: %" __FES__ "\n", error);

  /** finalise the plan */
  fastsum_finalize(&my_fastsum_plan);

  return EXIT_SUCCESS;
}
Exemplo n.º 20
0
/* Allocate and initialise a maxwell_data structure for an nx x ny x nz grid.

   Inputs:
     nx, ny, nz     grid dimensions; the effective rank is 1 when
                    ny == nz == 1, 2 when only nz == 1, and 3 otherwise.
     num_bands      passed to maxwell_set_num_bands().
     max_fft_bands  upper bound on d->max_fft_bands; the stored value is
                    MIN2(num_bands, max_fft_bands).

   Outputs (written through the pointers, and mirrored into d->local_N,
   d->N_start and d->alloc_N before returning):
     *local_N  without MPI: nx * ny * nz; with MPI: d->local_nx * ny * nz.
     *N_start  without MPI: 0; with MPI: d->local_x_start * ny * nz.
     *alloc_N  without MPI: nx * ny * nz; with MPI it is either the size
               reported by FFTW(mpi_local_size_transposed) (FFTW3, doubled
               in the non-SCALAR_COMPLEX build) or *local_N (FFTW2).

   The function also allocates d->eps_inv (d->fft_output_size entries),
   d->fft_data (3 * fft_data_size * d->max_fft_bands scalars, aliased by
   d->fft_data2), d->k_plus_G and d->k_plus_G_normsqr (*local_N entries
   each).

   Which code is compiled depends on HAVE_MPI, HAVE_FFTW (FFTW 2 API),
   HAVE_FFTW3 and SCALAR_COMPLEX.  With the FFTW 2 API the forward/backward
   plans are created here; with FFTW3 d->nplans is set to 0 and no plan is
   created in this function.

   Returns the newly allocated structure.  NOTE(review): CHECK and
   CHK_MALLOC are defined elsewhere; presumably they abort on failure, so
   this function is not expected to return NULL -- confirm. */
maxwell_data *create_maxwell_data(int nx, int ny, int nz,
				  int *local_N, int *N_start, int *alloc_N,
				  int num_bands,
				  int max_fft_bands)
{
     /* rank = number of leading dimensions treated as non-trivial */
     int n[3], rank = (nz == 1) ? (ny == 1 ? 1 : 2) : 3;
     maxwell_data *d = 0;
     int fft_data_size;

     n[0] = nx;
     n[1] = ny;
     n[2] = nz;

#if !defined(HAVE_FFTW) && !defined(HAVE_FFTW3)
#  error Non-FFTW FFTs are not currently supported.
#endif
     

#if defined(HAVE_FFTW)
     CHECK(sizeof(fftw_real) == sizeof(real),
	   "floating-point type is inconsistent with FFTW!");
#endif

     CHK_MALLOC(d, maxwell_data, 1);

     d->nx = nx;
     d->ny = ny;
     d->nz = nz;
     
     /* never use more FFT bands than there are bands */
     d->max_fft_bands = MIN2(num_bands, max_fft_bands);
     maxwell_set_num_bands(d, num_bands);

     d->current_k[0] = d->current_k[1] = d->current_k[2] = 0.0;
     d->parity = NO_PARITY;

     /* size of the last non-trivial dimension; last_dim_size is overridden
	below in the non-SCALAR_COMPLEX branches */
     d->last_dim_size = d->last_dim = n[rank - 1];

     /* ----------------------------------------------------- */
     /* default; the FFTW3 branches below reset this to 0 */
     d->nplans = 1;
#ifndef HAVE_MPI 
     /* serial case: this process owns the whole grid */
     d->local_nx = nx; d->local_ny = ny;
     d->local_x_start = d->local_y_start = 0;
     *local_N = *alloc_N = nx * ny * nz;
     *N_start = 0;
     d->other_dims = *local_N / d->last_dim;

     d->fft_data = 0;  /* initialize it here for use in specific planner? */

#  if defined(HAVE_FFTW3)
     d->nplans = 0; /* plans will be created as needed */
#    ifdef SCALAR_COMPLEX
     d->fft_output_size = fft_data_size = nx * ny * nz;
#    else
     /* last dimension sized as 2*(last_dim/2+1) real scalars; presumably
	the padding needed for in-place real<->complex transforms */
     d->last_dim_size = 2 * (d->last_dim / 2 + 1);
     d->fft_output_size = (fft_data_size = d->other_dims * d->last_dim_size)/2;
#    endif

#  elif defined(HAVE_FFTW)
#    ifdef SCALAR_COMPLEX
     d->fft_output_size = fft_data_size = nx * ny * nz;
     /* note: d->fft_data is still 0 at this point (set above) */
     d->plans[0] = fftwnd_create_plan_specific(rank, n, FFTW_BACKWARD,
					   FFTW_ESTIMATE | FFTW_IN_PLACE,
					   (fftw_complex*) d->fft_data,
					   3 * d->num_fft_bands,
					   (fftw_complex*) d->fft_data,
					   3 * d->num_fft_bands);
     d->iplans[0] = fftwnd_create_plan_specific(rank, n, FFTW_FORWARD,
					    FFTW_ESTIMATE | FFTW_IN_PLACE,
					    (fftw_complex*) d->fft_data,
					    3 * d->num_fft_bands,
					    (fftw_complex*) d->fft_data,
					    3 * d->num_fft_bands);
#    else /* not SCALAR_COMPLEX */
     d->last_dim_size = 2 * (d->last_dim / 2 + 1);
     d->fft_output_size = (fft_data_size = d->other_dims * d->last_dim_size)/2;
     d->plans[0] = rfftwnd_create_plan_specific(rank, n, FFTW_COMPLEX_TO_REAL,
					    FFTW_ESTIMATE | FFTW_IN_PLACE,
					    (fftw_real*) d->fft_data,
					    3 * d->num_fft_bands,
					    (fftw_real*) d->fft_data,
					    3 * d->num_fft_bands);
     d->iplans[0] = rfftwnd_create_plan_specific(rank, n, FFTW_REAL_TO_COMPLEX,
					     FFTW_ESTIMATE | FFTW_IN_PLACE,
					     (fftw_real*) d->fft_data,
					     3 * d->num_fft_bands,
					     (fftw_real*) d->fft_data,
					     3 * d->num_fft_bands);
#    endif /* not SCALAR_COMPLEX */
#  endif /* HAVE_FFTW */

#else /* HAVE_MPI */
     /* ----------------------------------------------------- */

#  if defined(HAVE_FFTW3)
{
     int i;
     ptrdiff_t np[3], local_nx, local_ny, local_x_start, local_y_start;

     CHECK(rank > 1, "rank < 2 MPI computations are not supported");

     d->nplans = 0; /* plans will be created as needed */

     /* ptrdiff_t copy of the dimensions for the FFTW3 MPI interface */
     for (i = 0; i < rank; ++i) np[i] = n[i];
     
#    ifndef SCALAR_COMPLEX
     /* last dimension is given to FFTW as last_dim/2 + 1 (complex count) */
     d->last_dim_size = 2 * (np[rank-1] = d->last_dim / 2 + 1);
#    endif

     fft_data_size = *alloc_N 
	  = FFTW(mpi_local_size_transposed)(rank, np, MPI_COMM_WORLD,
					    &local_nx, &local_x_start,
					    &local_ny, &local_y_start);
#    ifndef SCALAR_COMPLEX
     fft_data_size = (*alloc_N *= 2); // convert to # of real scalars
#    endif

     /* copy the ptrdiff_t results into the structure's fields */
     d->local_nx = local_nx;
     d->local_x_start = local_x_start;
     d->local_ny = local_ny;
     d->local_y_start = local_y_start;

     d->fft_output_size = nx * d->local_ny * (rank==3 ? np[2] : nz);
     *local_N = d->local_nx * ny * nz;
     *N_start = d->local_x_start * ny * nz;
     d->other_dims = *local_N / d->last_dim;
}
#  elif defined(HAVE_FFTW)

     CHECK(rank > 1, "rank < 2 MPI computations are not supported");

#    ifdef SCALAR_COMPLEX
     d->iplans[0] = fftwnd_mpi_create_plan(MPI_COMM_WORLD, rank, n,
				       FFTW_FORWARD,
				       FFTW_ESTIMATE | FFTW_IN_PLACE);
     {
	  int nt[3]; /* transposed dimensions for reverse FFT */
	  nt[0] = n[1]; nt[1] = n[0]; nt[2] = n[2]; 
	  d->plans[0] = fftwnd_mpi_create_plan(MPI_COMM_WORLD, rank, nt,
					   FFTW_BACKWARD,
					   FFTW_ESTIMATE | FFTW_IN_PLACE);
     }

     fftwnd_mpi_local_sizes(d->iplans[0], &d->local_nx, &d->local_x_start,
			    &d->local_ny, &d->local_y_start,
			    &fft_data_size);
     
     d->fft_output_size = nx * d->local_ny * nz;

#    else /* not SCALAR_COMPLEX */

     /* (same check as above, repeated for this branch) */
     CHECK(rank > 1, "rank < 2 MPI computations are not supported");

     d->iplans[0] = rfftwnd_mpi_create_plan(MPI_COMM_WORLD, rank, n,
					FFTW_REAL_TO_COMPLEX,
					FFTW_ESTIMATE | FFTW_IN_PLACE);

     /* Unlike fftwnd_mpi, we do *not* pass transposed dimensions for
	the reverse transform here--we always pass the dimensions of the
	original real array, and rfftwnd_mpi assumes that if one
	transform is transposed, then the other is as well. */
     d->plans[0] = rfftwnd_mpi_create_plan(MPI_COMM_WORLD, rank, n,
				       FFTW_COMPLEX_TO_REAL,
				       FFTW_ESTIMATE | FFTW_IN_PLACE);

     rfftwnd_mpi_local_sizes(d->iplans[0], &d->local_nx, &d->local_x_start,
			     &d->local_ny, &d->local_y_start,
			     &fft_data_size);

     d->last_dim_size = 2 * (d->last_dim / 2 + 1);
     if (rank == 2)
	  d->fft_output_size = nx * d->local_ny * nz;
     else
	  d->fft_output_size = nx * d->local_ny * (d->last_dim_size / 2);

#    endif /* not SCALAR_COMPLEX */
     
     /* common to both FFTW 2 MPI variants */
     *local_N = d->local_nx * ny * nz;
     *N_start = d->local_x_start * ny * nz;
     *alloc_N = *local_N;
     d->other_dims = *local_N / d->last_dim;

#  endif /* HAVE_FFTW */

#endif /* HAVE_MPI */
     /* ----------------------------------------------------- */

#ifdef HAVE_FFTW
     /* only the FFTW 2 branches create plans in this function */
     CHECK(d->plans[0] && d->iplans[0], "FFTW plan creation failed");
#endif

     CHK_MALLOC(d->eps_inv, symmetric_matrix, d->fft_output_size);

     /* A scratch output array is required because the "ordinary" arrays
	are not in a cartesian basis (or even a constant basis). */
     fft_data_size *= d->max_fft_bands;
#if defined(HAVE_FFTW3)
     /* FFTW3 build allocates through FFTW(malloc) rather than CHK_MALLOC */
     d->fft_data = (scalar *) FFTW(malloc)(sizeof(scalar) * 3 * fft_data_size);
     CHECK(d->fft_data, "out of memory!");
     d->fft_data2 = d->fft_data; /* works in-place */
#else     
     CHK_MALLOC(d->fft_data, scalar, 3 * fft_data_size);
     d->fft_data2 = d->fft_data; /* works in-place */
#endif

     CHK_MALLOC(d->k_plus_G, k_data, *local_N);
     CHK_MALLOC(d->k_plus_G_normsqr, real, *local_N);

     d->eps_inv_mean = 1.0;

     /* mirror the output parameters into the structure */
     d->local_N = *local_N;
     d->N_start = *N_start;
     d->alloc_N = *alloc_N;
     d->N = nx * ny * nz;

     return d;
}
Exemplo n.º 21
0
int bench_openmp(FILE *infile, int n, int m, int p,
    C (*kernel)(R, int, const R *), R c, R eps_I, R eps_B)
{
  fastsum_plan my_fastsum_plan;
  int d, L, M;
  int t, j;
  R re, im;
  R r_max = K(0.25) - my_fastsum_plan.eps_B / K(2.0);
  ticks t0, t1;
  R tt_total;

  fscanf(infile, "%d %d %d", &d, &L, &M);

#ifdef _OPENMP
  FFTW(import_wisdom_from_filename)("fastsum_benchomp_detail_threads.plan");
#else
  FFTW(import_wisdom_from_filename)("fastsum_benchomp_detail_single.plan");
#endif

  fastsum_init_guru(&my_fastsum_plan, d, L, M, kernel, &c, NEARFIELD_BOXES, n,
      m, p, eps_I, eps_B);

#ifdef _OPENMP
  FFTW(export_wisdom_to_filename)("fastsum_benchomp_detail_threads.plan");
#else
  FFTW(export_wisdom_to_filename)("fastsum_benchomp_detail_single.plan");
#endif

  for (j = 0; j < L; j++)
  {
    for (t = 0; t < d; t++)
    {
      R v;
      fscanf(infile, __FR__, &v);
      my_fastsum_plan.x[d * j + t] = v * r_max;
    }
  }

  for (j = 0; j < L; j++)
  {
    fscanf(infile, __FR__ " " __FR__, &re, &im);
    my_fastsum_plan.alpha[j] = re + II * im;
  }

  for (j = 0; j < M; j++)
  {
    for (t = 0; t < d; t++)
    {
      R v;
      fscanf(infile, __FR__, &v);
      my_fastsum_plan.y[d * j + t] = v * r_max;
    }
  }

  /** precomputation */
  t0 = getticks();
  fastsum_precompute(&my_fastsum_plan);

  /** fast computation */
  fastsum_trafo(&my_fastsum_plan);
  t1 = getticks();
  tt_total = NFFT(elapsed_seconds)(t1, t0);

#ifndef MEASURE_TIME
  my_fastsum_plan.MEASURE_TIME_t[0] = K(0.0);
  my_fastsum_plan.MEASURE_TIME_t[1] = K(0.0);
  my_fastsum_plan.MEASURE_TIME_t[2] = K(0.0);
  my_fastsum_plan.MEASURE_TIME_t[3] = K(0.0);
  my_fastsum_plan.MEASURE_TIME_t[4] = K(0.0);
  my_fastsum_plan.MEASURE_TIME_t[5] = K(0.0);
  my_fastsum_plan.MEASURE_TIME_t[6] = K(0.0);
  my_fastsum_plan.MEASURE_TIME_t[7] = K(0.0);
  my_fastsum_plan.mv1.MEASURE_TIME_t[0] = K(0.0);
  my_fastsum_plan.mv1.MEASURE_TIME_t[2] = K(0.0);
  my_fastsum_plan.mv2.MEASURE_TIME_t[0] = K(0.0);
  my_fastsum_plan.mv2.MEASURE_TIME_t[2] = K(0.0);
#endif
#ifndef MEASURE_TIME_FFTW
  my_fastsum_plan.mv1.MEASURE_TIME_t[1] = K(0.0);
  my_fastsum_plan.mv2.MEASURE_TIME_t[1] = K(0.0);
#endif

  printf(
      "%.6" __FES__ " %.6" __FES__ " %.6" __FES__ " %6" __FES__ " %.6" __FES__ " %.6" __FES__ " %.6" __FES__ " %.6" __FES__ " %.6" __FES__ " %6" __FES__ " %.6" __FES__ " %.6" __FES__ " %6" __FES__ " %.6" __FES__ " %.6" __FES__ " %6" __FES__ "\n",
      my_fastsum_plan.MEASURE_TIME_t[0], my_fastsum_plan.MEASURE_TIME_t[1],
      my_fastsum_plan.MEASURE_TIME_t[2], my_fastsum_plan.MEASURE_TIME_t[3],
      my_fastsum_plan.MEASURE_TIME_t[4], my_fastsum_plan.MEASURE_TIME_t[5],
      my_fastsum_plan.MEASURE_TIME_t[6], my_fastsum_plan.MEASURE_TIME_t[7],
      tt_total - my_fastsum_plan.MEASURE_TIME_t[0]
          - my_fastsum_plan.MEASURE_TIME_t[1]
          - my_fastsum_plan.MEASURE_TIME_t[2]
          - my_fastsum_plan.MEASURE_TIME_t[3]
          - my_fastsum_plan.MEASURE_TIME_t[4]
          - my_fastsum_plan.MEASURE_TIME_t[5]
          - my_fastsum_plan.MEASURE_TIME_t[6]
          - my_fastsum_plan.MEASURE_TIME_t[7], tt_total,
      my_fastsum_plan.mv1.MEASURE_TIME_t[0],
      my_fastsum_plan.mv1.MEASURE_TIME_t[1],
      my_fastsum_plan.mv1.MEASURE_TIME_t[2],
      my_fastsum_plan.mv2.MEASURE_TIME_t[0],
      my_fastsum_plan.mv2.MEASURE_TIME_t[1],
      my_fastsum_plan.mv2.MEASURE_TIME_t[2]);

  fastsum_finalize(&my_fastsum_plan);

  return 0;
}
Exemplo n.º 22
0
/** computes the inverse discrete Radon transform of Rf
 *  on the grid given by gridfcn() with T angles and S offsets
 *  by a NFFT-based CG-type algorithm
 *
 *  \param gridfcn  called as gridfcn(T, S, x, w) to fill the 2*T*S node
 *                  coordinates x and the T*S weights w
 *  \param T        number of angles
 *  \param S        number of offsets per angle
 *  \param Rf       input samples, read as Rf[t * S + r]
 *  \param NN       bandwidth used for both dimensions of the 2D NFFT
 *  \param f        output; receives the real parts of the N_total
 *                  iterated Fourier coefficients
 *  \param max_i    number of solver iterations; for max_i < 1 no iteration
 *                  is run and the initial p_hat_iter is returned instead
 *  \return 0 on success, EXIT_FAILURE if an allocation or the creation of
 *          the 1D FFTW plan fails (everything acquired so far is released)
 */
static int inverse_radon_trafo(int (*gridfcn)(), int T, int S, NFFT_R *Rf, int NN, NFFT_R *f,
    int max_i)
{
  int j, k; /**< index for nodes and freqencies   */
  NFFT(plan) my_nfft_plan; /**< plan for the nfft-2D             */
  SOLVER(plan_complex) my_infft_plan; /**< plan for the inverse nfft        */

  NFFT_C *fft; /**< variable for the fftw-1Ds        */
  FFTW(plan) my_fftw_plan; /**< plan for the fftw-1Ds            */

  int t, r; /**< index for directions and offsets */
  NFFT_R *x, *w; /**< knots and associated weights     */
  int l; /**< index for iterations             */

  int N[2], n[2];
  int M = T * S;

  N[0] = NN;
  n[0] = 2 * N[0];
  N[1] = NN;
  n[1] = 2 * N[1];

  fft = (NFFT_C *) NFFT(malloc)((size_t)(S) * sizeof(NFFT_C));
  if (fft == NULL)
    return EXIT_FAILURE;

  my_fftw_plan = FFTW(plan_dft_1d)(S, fft, fft, FFTW_FORWARD, FFTW_MEASURE);
  if (my_fftw_plan == NULL)
  {
    NFFT(free)(fft);
    return EXIT_FAILURE;
  }

  x = (NFFT_R *) NFFT(malloc)((size_t)(2 * T * S) * (sizeof(NFFT_R)));
  if (x == NULL)
  {
    /* release what was acquired before bailing out */
    FFTW(destroy_plan)(my_fftw_plan);
    NFFT(free)(fft);
    return EXIT_FAILURE;
  }

  w = (NFFT_R *) NFFT(malloc)((size_t)(T * S) * (sizeof(NFFT_R)));
  if (w == NULL)
  {
    NFFT(free)(x);
    FFTW(destroy_plan)(my_fftw_plan);
    NFFT(free)(fft);
    return EXIT_FAILURE;
  }

  /** init two dimensional NFFT plan */
  NFFT(init_guru)(&my_nfft_plan, 2, N, M, n, 4,
      PRE_PHI_HUT | PRE_PSI | MALLOC_X | MALLOC_F_HAT | MALLOC_F | FFTW_INIT
          | FFT_OUT_OF_PLACE,
      FFTW_MEASURE | FFTW_DESTROY_INPUT);

  /** init two dimensional infft plan */
  SOLVER(init_advanced_complex)(&my_infft_plan,
      (NFFT(mv_plan_complex)*) (&my_nfft_plan), CGNR | PRECOMPUTE_WEIGHT);

  /** init nodes and weights of grid*/
  gridfcn(T, S, x, w);
  for (j = 0; j < my_nfft_plan.M_total; j++)
  {
    my_nfft_plan.x[2 * j + 0] = x[2 * j + 0];
    my_nfft_plan.x[2 * j + 1] = x[2 * j + 1];
    /* the first sample of every direction (j % S == 0) gets weight zero */
    if (j % S)
      my_infft_plan.w[j] = w[j];
    else
      my_infft_plan.w[j] = NFFT_K(0.0);
  }

  /** precompute psi, the entries of the matrix B */
  if (my_nfft_plan.flags & PRE_LIN_PSI)
    NFFT(precompute_lin_psi)(&my_nfft_plan);

  if (my_nfft_plan.flags & PRE_PSI)
    NFFT(precompute_psi)(&my_nfft_plan);

  if (my_nfft_plan.flags & PRE_FULL_PSI)
    NFFT(precompute_full_psi)(&my_nfft_plan);

  /** compute 1D-ffts and init given samples and weights */
  for (t = 0; t < T; t++)
  {
    for (r = 0; r < S; r++)
      fft[r] = Rf[t * S + r] + _Complex_I * NFFT_K(0.0);

    NFFT(fftshift_complex_int)(fft, 1, &S);
    FFTW(execute)(my_fftw_plan);
    NFFT(fftshift_complex_int)(fft, 1, &S);

    /* matches the zero weight assigned to index t * S above */
    my_infft_plan.y[t * S] = NFFT_K(0.0);
    for (r = -S / 2 + 1; r < S / 2; r++)
      my_infft_plan.y[t * S + (r + S / 2)] = fft[r + S / 2] / KERNEL(r);
  }

  /** initialise some guess f_hat_0 */
  for (k = 0; k < my_nfft_plan.N_total; k++)
    my_infft_plan.f_hat_iter[k] = NFFT_K(0.0) + _Complex_I * NFFT_K(0.0);

  /** solve the system */
  SOLVER(before_loop_complex)(&my_infft_plan);

  if (max_i < 1)
  {
    l = 1;
    for (k = 0; k < my_nfft_plan.N_total; k++)
      my_infft_plan.f_hat_iter[k] = my_infft_plan.p_hat_iter[k];
  }
  else
  {
    for (l = 1; l <= max_i; l++)
    {
      SOLVER(loop_one_step_complex)(&my_infft_plan);
      /*if (sqrt(my_infft_plan.dot_r_iter)<=1e-12) break;*/
    }
  }
  /*printf("after %d iteration(s): weighted 2-norm of original residual vector = %g\n",l-1,sqrt(my_infft_plan.dot_r_iter));*/

  /** copy result */
  for (k = 0; k < my_nfft_plan.N_total; k++)
    f[k] = NFFT_M(creal)(my_infft_plan.f_hat_iter[k]);

  /** finalise the plans and free the variables */
  FFTW(destroy_plan)(my_fftw_plan);
  NFFT(free)(fft);
  SOLVER(finalize_complex)(&my_infft_plan);
  NFFT(finalize)(&my_nfft_plan);
  NFFT(free)(x);
  NFFT(free)(w);
  return 0;
}
Exemplo n.º 23
0
/**
 * Times repeated serial 3D FFTW forward/backward transforms and reports the
 * round-trip error.
 *
 * The transform size, iteration count, in-place flag and planner patience
 * come from init_parameters(); the defaults are 16^3, 10 iterations,
 * out-of-place and FFTW_ESTIMATE. MPI is used only for initialisation, the
 * wall clock (MPI_Wtime) and the communicator passed to the error check.
 *
 * Returns 0 on success and 1 if the arrays cannot be allocated.
 */
int main(int argc, char **argv)
{
  int n[3];
  pfft_complex *in, *out;
  FFTW(plan) plan_forw=NULL, plan_back=NULL;
  double err, time_fftw[2];
  unsigned fftw_flag;

  /* setup default parameters */
  int iter = 10, inplace = 0, patience = 0;  
  
  /* Set size of FFT and process mesh */
  n[0] = n[1] = n[2] = 16;
 
  /* Initialize MPI and PFFT */
  MPI_Init(&argc, &argv);
  pfft_init();

  /* read parameters from command line */
  init_parameters(argc, argv, n, &iter, &inplace, &patience);

  /* setup FFTWs planing depth */  
  switch(patience){
    case 1: fftw_flag = FFTW_MEASURE; break;
    case 2: fftw_flag = FFTW_PATIENT; break;
    case 3: fftw_flag = FFTW_EXHAUSTIVE; break;
    default: fftw_flag = FFTW_ESTIMATE;
  }
  
  if(!inplace)
    fftw_flag |= FFTW_DESTROY_INPUT;

  /* Allocate memory */
  in = pfft_alloc_complex(n[0]*n[1]*n[2]);
  out = (inplace) ? in : pfft_alloc_complex(n[0]*n[1]*n[2]);

  /* We often want to scale large FFTs, which do not fit on few processes. */
  if( (in == NULL) || (out == NULL)){
    fprintf(stderr, "!!! Error: Not enough memory to allocate input/output arrays !!!\n");
    /* release whichever buffer was obtained; in-place shares one buffer */
    if(in != NULL) pfft_free(in);
    if(!inplace && (out != NULL)) pfft_free(out);
    /* MPI_Finalize must be called exactly once */
    MPI_Finalize();
    return 1;
  }

  /* the whole array is local: full extents, zero offsets */
  ptrdiff_t local_ni[3], local_i_start[3], n_ptr[3];
  for(int t=0; t<3; t++){
    local_i_start[t] = 0;
    n_ptr[t] = local_ni[t] = (ptrdiff_t) n[t];
  }
  
  plan_forw = FFTW(plan_dft_3d)(n[0], n[1], n[2], in, out, FFTW_FORWARD, fftw_flag);
  plan_back = FFTW(plan_dft_3d)(n[0], n[1], n[2], out, in, FFTW_BACKWARD, fftw_flag);
  
  /* Initialize input with random numbers */
  pfft_init_input_complex_3d(n_ptr, local_ni, local_i_start,
      in);

  time_fftw[0] = time_fftw[1] = 0;
  for(int t=0; t<iter; t++){
    /* execute forward FFT */
    time_fftw[0] -= MPI_Wtime();
    FFTW(execute)(plan_forw);
    time_fftw[0] += MPI_Wtime();
  
    /* execute backward FFT */
    time_fftw[1] -= MPI_Wtime();
    FFTW(execute)(plan_back);
    time_fftw[1] += MPI_Wtime();
  }
  
  /* Scale data: one division by the transform size per forward/backward
   * round trip performed above */
  for(int t=0; t<iter; t++)
    for(ptrdiff_t l=0; l < n[0] * n[1] * n[2]; l++)
      in[l] /= (n[0]*n[1]*n[2]);

  printf("fftw_forw = %.2e, fftw_back = %.2e\n", time_fftw[0]/iter, time_fftw[1]/iter);
 
  err = pfft_check_output_complex_3d(n_ptr, local_ni, local_i_start, in, MPI_COMM_WORLD);
  /* n[] holds plain ints, so the conversion must be %d rather than %td */
  printf("Error after several forward and backward FFTWs of size n=(%d, %d, %d):\n", n[0], n[1], n[2]); 
  printf("maxerror = %6.2e;\n", err);
  
  /* free mem and finalize */
  FFTW(destroy_plan)(plan_forw);
  FFTW(destroy_plan)(plan_back);
  pfft_free(in); if(!inplace) pfft_free(out);
 
  MPI_Finalize();
  return 0;
}
Exemplo n.º 24
0
/* Hook run after real input data has been copied from the problem:
   hands ri to do_scatter_in() and then executes the input scrambling
   plan if one is set.  The problem argument is not used. */
void after_problem_rcopy_from(bench_problem *p, bench_real *ri)
{
     UNUSED(p);

     do_scatter_in(ri);

     /* no scramble plan set: nothing more to do */
     if (!plan_scramble_in)
	  return;

     FFTW(execute)(plan_scramble_in);
}