示例#1
0
 void bspfft1d_init(int n1, int N, int s, int t, double *w0, double *w, double *tw,
 int *rho_np, int *rho_p){

   /* Parallel setup routine: fills all the tables used in the FFT.
      - rho_np and rho_p are filled by bitrev_init (sizes: local length, N)
      - w0 and w are filled by ufft_init (sizes: k1, local length)
      - tw receives one block of 2*(local length) doubles per pass of the
        twiddle loop below
      The parameter s is not referenced in this routine. */

   int local_len, first_len, stride;
   int block;
   double frac;

   local_len= nloc(N,t,n1);

   /* tables produced by bitrev_init */
   bitrev_init(local_len,rho_np);
   bitrev_init(N,rho_p);

   /* tables produced by ufft_init */
   first_len= k1_init(n1,N,local_len);
   ufft_init(first_len,w0);
   ufft_init(local_len,w);

   /* one twiddle block per stride value; the stride starts at first_len
      and is multiplied by local_len until it exceeds N */
   block= 0;
   stride= first_len;
   while (stride<=N){
     frac= (t%stride) / (double)(stride);
     twiddle_init(local_len,frac,rho_np,&tw[2*block*local_len]);
     block++;
     stride *= local_len;
   }

 } /* end bspfft1d_init */
示例#2
0
/*
 * Set up the parts of a distributed FFT plan that are shared by the host
 * and the device code paths.
 *
 * Parameters
 *   p              plan structure to fill in
 *   ndim           number of dimensions
 *   gdim           global extent per dimension (each must pass the
 *                  power-of-two bit test)
 *   inembed/oembed optional per-dimension local embedding of input/output;
 *                  when NULL, gdim[i]/pdim[i] is used. Entry 0 is always
 *                  overwritten with gdim[0]/pdim[0].
 *   pdim           processor grid extent per dimension
 *   pidx           coordinates of this processor in the grid
 *   row_m          stored in p->row_m
 *   input_cyclic,
 *   output_cyclic  stored in the corresponding plan fields
 *   comm           MPI communicator; its size must pass the power-of-two
 *                  bit test
 *   proc_map       processor map with one int per rank in comm (copied)
 *   device         non-zero selects the CUDA path, zero the host path
 *
 * Returns
 *   0  success
 *   1  initialization of the local FFT library failed
 *   2  device plan requested but ENABLE_CUDA is not defined
 *   3  host plan requested but ENABLE_HOST is not defined
 *   4  communicator size is not a power of two
 *   5  a global dimension is not a power of two
 *
 * All checks that can fail are performed before the first allocation, so
 * an error return never leaves buffers attached to the plan.
 *
 * NOTE(review): results of malloc and of the aligned-allocation helpers
 * are not checked; reporting such failures would need a new error code.
 */
int dfft_create_plan_common(dfft_plan *p,
    int ndim, int *gdim,
    int *inembed, int *oembed,
    int *pdim, int *pidx, int row_m,
    int input_cyclic, int output_cyclic,
    MPI_Comm comm,
    int *proc_map,
    int device)
    {
    int nump;
    int i;

    p->comm = comm;

    MPI_Comm_size(comm,&nump);

    /* number of processors must be a power of two */
    if (nump & (nump-1)) return 4;

    /* Every dimension must be a power of two. This is verified up front,
     * before anything is allocated, so that a rejected dimension does not
     * leak the plan's buffers. */
    for (i = 0; i < ndim; i++)
        {
        if (gdim[i] & (gdim[i]-1)) return 5;
        }

    /* Reject a backend that was not compiled in, again before allocating */
    #ifndef ENABLE_HOST
    if (!device) return 3;
    #endif
    #ifndef ENABLE_CUDA
    if (device) return 2;
    #endif

    /* init local FFT library. The init functions do not take the plan as
     * an argument, so they are called before the allocations below; a
     * failure then has nothing to release. */
    int res = 0;
    if (!device)
        {
        #ifdef ENABLE_HOST
        res = dfft_init_local_fft();
        #endif
        }
    else
        {
        #ifdef ENABLE_CUDA
        res = dfft_cuda_init_local_fft();
        #endif
        }

    if (res) return 1;

    /* Allocate memory for processor map and copy over */
    p->proc_map = malloc(sizeof(int)*nump);
    memcpy(p->proc_map, proc_map, sizeof(int)*nump);

    p->pdim = malloc(ndim*sizeof(int));
    p->gdim = malloc(ndim*sizeof(int));
    p->pidx = malloc(ndim*sizeof(int));

    p->inembed = malloc(ndim*sizeof(int));
    p->oembed = malloc(ndim*sizeof(int));

    p->ndim = ndim;

    for (i = 0; i < ndim; i++)
        {
        p->gdim[i] = gdim[i];
        p->pdim[i] = pdim[i];
        }

    /* embedding defaults to the local extent gdim/pdim when not supplied */
    if (inembed != NULL)
        {
        for (i = 0; i < ndim; i++)
            p->inembed[i] = inembed[i];
        }
    else
        {
        for (i = 0; i < ndim; i++)
            p->inembed[i] = p->gdim[i]/p->pdim[i];
        }

    if (oembed != NULL)
        {
        for (i = 0; i < ndim; i++)
            p->oembed[i] = oembed[i];
        }
    else
        {
        for (i = 0; i < ndim; i++)
            p->oembed[i] = p->gdim[i]/p->pdim[i];
        }

    /* per-rank communication bookkeeping */
    p->offset_send = (int *)malloc(sizeof(int)*nump);
    p->offset_recv = (int *)malloc(sizeof(int)*nump);
    p->nsend = (int *)malloc(sizeof(int)*nump);
    p->nrecv = (int *)malloc(sizeof(int)*nump);

    if (!device)
        {
        #ifdef ENABLE_HOST
        p->plans_short_forward = malloc(sizeof(plan_t)*ndim);
        p->plans_long_forward = malloc(sizeof(plan_t)*ndim);
        p->plans_short_inverse = malloc(sizeof(plan_t)*ndim);
        p->plans_long_inverse = malloc(sizeof(plan_t)*ndim);
        #endif
        }

    /* local problem size */
    int size_in = 1;
    int size_out = 1;

    /* since we expect column-major input, the leading dimension
      has no embedding */
    p->inembed[0] = gdim[0]/pdim[0];
    p->oembed[0] = gdim[0]/pdim[0];

    for (i = 0; i < ndim ; ++i)
        {
        size_in *= p->inembed[i];
        size_out *= p->oembed[i];
        }

    p->size_in = size_in;
    p->size_out = size_out;

    /* find length k0 of last stage of butterflies */
    p->k0 = malloc(sizeof(int)*ndim);

    for (i = 0; i< ndim; ++i)
        {
        int length = gdim[i]/pdim[i];
        if (length > 1)
            {
            /* divide gdim by the local length until it no longer
             * exceeds the local length */
            int c;
            for (c=gdim[i]; c>length; c /= length)
                ;
            p->k0[i] = c;
            }
        else
            {
            p->k0[i] = 1;
            }
        }

    /* per-dimension tables filled by bitrev_init */
    p->rho_L = (int **)malloc(ndim*sizeof(int *));
    p->rho_pk0= (int **)malloc(ndim*sizeof(int *));
    p->rho_Lk0 = (int **)malloc(ndim*sizeof(int *));

    for (i = 0; i < ndim; ++i)
        {
        int length = gdim[i]/pdim[i];
        p->rho_L[i] = (int *) malloc(sizeof(int)*length);
        p->rho_pk0[i] = (int *) malloc(sizeof(int)*pdim[i]/(p->k0[i]));
        p->rho_Lk0[i] = (int *) malloc(sizeof(int)*length/(p->k0[i]));
        bitrev_init(length, p->rho_L[i]);
        bitrev_init(pdim[i]/(p->k0[i]),p->rho_pk0[i]);
        bitrev_init(length/(p->k0[i]),p->rho_Lk0[i]);
        }

    /* processor coordinates */
    for (i = 0; i < ndim; ++i)
        {
        p->pidx[i] = pidx[i];
        }

    int size = size_in;

    p->dfft_multi = 0;

    p->device = device;

    if (device)
        {
        /* use multidimensional local transforms */
        dfft_create_execution_flow(p);

        /* allocate storage for variables */
        int dmax = p->max_depth + 2;
        p->rev_j1 = (int **) malloc(sizeof(int *)*dmax);
        p->rev_global = (int **) malloc(sizeof(int *)*dmax);
        p->rev_partial = (int **) malloc(sizeof(int *)*dmax);
        p->c0 = (int **) malloc(sizeof(int *)*dmax);
        p->c1 = (int **) malloc(sizeof(int *)*dmax);
        int d;
        for (d = 0; d < dmax; ++d)
            {
            p->rev_j1[d] = (int *) malloc(sizeof(int)*ndim);
            p->rev_global[d] = (int *) malloc(sizeof(int)*ndim);
            p->rev_partial[d] = (int *) malloc(sizeof(int)*ndim);
            p->c0[d] = (int *) malloc(sizeof(int)*ndim);
            p->c1[d] = (int *) malloc(sizeof(int)*ndim);
            }

        p->dfft_multi = 1;
        }
    else
        {
        for (i = 0; i < ndim; ++i)
            {
            /* plan for short-distance butterflies */
            int st = size/p->inembed[i]*(gdim[i]/pdim[i]);

            #ifdef ENABLE_HOST
            int howmany = 1;
            #ifdef FFT1D_SUPPORTS_THREADS
            howmany = st/(p->k0[i]);
            #endif
            dfft_create_1d_plan(&(p->plans_short_forward[i]),p->k0[i],
                howmany, st/(p->k0[i]), 1, st/(p->k0[i]), 1, 0);
            dfft_create_1d_plan(&(p->plans_short_inverse[i]),p->k0[i],
                howmany, st/(p->k0[i]), 1, st/(p->k0[i]), 1, 1);

            /* plan for long-distance butterflies */
            int length = gdim[i]/pdim[i];
            #ifdef FFT1D_SUPPORTS_THREADS
            howmany = st/length;
            #endif
            dfft_create_1d_plan(&(p->plans_long_forward[i]), length,
                howmany, st/length,1, st/length,1, 0);
            dfft_create_1d_plan(&(p->plans_long_inverse[i]), length,
                howmany, st/length,1, st/length,1, 1);
            #endif

            /* dimension i switches from input to output embedding */
            size /= p->inembed[i];
            size *= p->oembed[i];
            }
        }

    /* Allocate scratch space: product over all dimensions of the larger
     * of the input and output embedding */
    int scratch_size = 1;
    for (i = 0; i < ndim; ++i)
        scratch_size *= ((p->inembed[i] > p->oembed[i]) ? p->inembed[i]  : p->oembed[i]);
    p->scratch_size = scratch_size;

    if (!device)
        {
        #ifdef ENABLE_HOST
        dfft_allocate_aligned_memory(&(p->scratch),sizeof(cpx_t)*scratch_size);
        dfft_allocate_aligned_memory(&(p->scratch_2),sizeof(cpx_t)*scratch_size);
        dfft_allocate_aligned_memory(&(p->scratch_3),sizeof(cpx_t)*scratch_size);
        #endif
        }
    else
        {
        #ifdef ENABLE_CUDA
        dfft_cuda_allocate_aligned_memory(&(p->d_scratch),sizeof(cuda_cpx_t)*scratch_size);
        dfft_cuda_allocate_aligned_memory(&(p->d_scratch_2),sizeof(cuda_cpx_t)*scratch_size);
        dfft_cuda_allocate_aligned_memory(&(p->d_scratch_3),sizeof(cuda_cpx_t)*scratch_size);
        #endif
        }

    p->input_cyclic = input_cyclic;
    p->output_cyclic = output_cyclic;

    /* CUDA error checking is switched on in debug builds only */
    #ifdef ENABLE_CUDA
    #ifndef NDEBUG
    p->check_cuda_errors = 1;
    #else
    p->check_cuda_errors = 0;
    #endif
    #endif

    p->row_m = row_m;

    /* before plan creation is complete, an initialization run will
     * be performed */
    p->init = 1;

    return 0;
    }