/*
 * bspfft1d_init - build all lookup tables used by the parallel 1D FFT.
 *
 * n1, N, t : forwarded to nloc() and k1_init() to obtain the local length
 *            and the first butterfly length; N is also the upper bound of
 *            the twiddle loop and t selects the twiddle phase (t % c).
 *            NOTE(review): N looks like the process count and t like the
 *            process rank -- confirm against the callers.
 * s        : not referenced in this function.
 * w0       : filled by ufft_init() for length k1.
 * w        : filled by ufft_init() for the local length.
 * tw       : twiddle storage; every loop pass hands twiddle_init() a
 *            pointer advanced by 2 * (local length) doubles.
 * rho_np   : bit-reversal table for the local length.
 * rho_p    : bit-reversal table for length N.
 */
void bspfft1d_init(int n1, int N, int s, int t, double *w0, double *w, double *tw, int *rho_np, int *rho_p)
{
    /* Local length as computed by nloc(). */
    const int local_len = nloc(N, t, n1);

    /* Bit-reversal tables. */
    bitrev_init(local_len, rho_np);
    bitrev_init(N, rho_p);

    /* Weight tables for the first-stage length and for the local length. */
    const int first_len = k1_init(n1, N, local_len);
    ufft_init(first_len, w0);
    ufft_init(local_len, w);

    /* One twiddle table per stage; the stage size grows by a factor of
     * local_len each pass until it exceeds N. */
    int table_idx = 0;
    int stage = first_len;
    while (stage <= N) {
        const double phase = (t % stage) / (double)stage;

        twiddle_init(local_len, phase, rho_np, tw + 2 * table_idx * local_len);
        table_idx += 1;
        stage *= local_len;
    }
} /* end bspfft1d_init */
/*
 * dfft_create_plan_common - fill in a distributed FFT plan for either the
 * host (device == 0) or the CUDA (device != 0) code path.
 *
 * p             : plan structure to initialize
 * ndim          : number of dimensions; must be at least 1 because element 0
 *                 of gdim/pdim (and of the plan's embedding arrays) is
 *                 accessed unconditionally
 * gdim          : global size per dimension; each must be a positive power
 *                 of two
 * inembed       : per-dimension input embedding, or NULL to use
 *                 gdim[i]/pdim[i]; entry 0 is always overridden
 * oembed        : per-dimension output embedding, or NULL to use
 *                 gdim[i]/pdim[i]; entry 0 is always overridden
 * pdim          : processor grid size per dimension; used as a divisor, so
 *                 every entry must be nonzero
 * pidx          : processor coordinates, copied into the plan
 * row_m         : stored in the plan as-is
 * input_cyclic  : stored in the plan as-is
 * output_cyclic : stored in the plan as-is
 * comm          : MPI communicator; its size must be a power of two
 * proc_map      : processor map with one int per rank of comm; copied
 * device        : nonzero selects the CUDA path, zero the host path
 *
 * Returns
 *   0  success
 *   1  the local FFT library initialization reported an error
 *   2  device path requested but ENABLE_CUDA is not compiled in
 *   3  host path requested but ENABLE_HOST is not compiled in
 *   4  number of processors is not a power of two
 *   5  a global dimension is not a positive power of two
 *
 * Every check that can fail is performed before the first allocation, so an
 * error return never leaves partially allocated plan members behind.
 */
int dfft_create_plan_common(dfft_plan *p, int ndim, int *gdim, int *inembed, int *oembed, int *pdim, int *pidx, int row_m, int input_cyclic, int output_cyclic, MPI_Comm comm, int *proc_map, int device)
{
    int i;
    int nump;

    p->comm = comm;
    MPI_Comm_size(comm, &nump);

    /* number of processors must be a power of two */
    if (nump & (nump - 1))
        return 4;

    /* Every dimension must be a power of two.  The bit trick alone accepts
     * zero, which would later be used as a divisor, so non-positive sizes
     * are rejected explicitly. */
    for (i = 0; i < ndim; i++) {
        if (gdim[i] <= 0 || (gdim[i] & (gdim[i] - 1)))
            return 5;
    }

    /* the requested backend must have been compiled in */
#ifndef ENABLE_HOST
    if (!device)
        return 3;
#endif
#ifndef ENABLE_CUDA
    if (device)
        return 2;
#endif

    /* init local FFT library (takes no plan state, so it can run before
     * anything is allocated) */
    int res = 0;
    if (!device) {
#ifdef ENABLE_HOST
        res = dfft_init_local_fft();
#endif
    } else {
#ifdef ENABLE_CUDA
        res = dfft_cuda_init_local_fft();
#endif
    }
    if (res)
        return 1;

    /* Allocate memory for processor map and copy over */
    p->proc_map = malloc(sizeof(int) * nump);
    memcpy(p->proc_map, proc_map, sizeof(int) * nump);

    p->pdim = malloc(ndim * sizeof(int));
    p->gdim = malloc(ndim * sizeof(int));
    p->pidx = malloc(ndim * sizeof(int));
    p->inembed = malloc(ndim * sizeof(int));
    p->oembed = malloc(ndim * sizeof(int));

    p->ndim = ndim;

    for (i = 0; i < ndim; i++) {
        p->gdim[i] = gdim[i];
        p->pdim[i] = pdim[i];
    }

    /* embeddings default to the local extent of each dimension */
    if (inembed != NULL) {
        for (i = 0; i < ndim; i++)
            p->inembed[i] = inembed[i];
    } else {
        for (i = 0; i < ndim; i++)
            p->inembed[i] = p->gdim[i] / p->pdim[i];
    }

    if (oembed != NULL) {
        for (i = 0; i < ndim; i++)
            p->oembed[i] = oembed[i];
    } else {
        for (i = 0; i < ndim; i++)
            p->oembed[i] = p->gdim[i] / p->pdim[i];
    }

    /* per-rank communication bookkeeping */
    p->offset_send = malloc(sizeof(int) * nump);
    p->offset_recv = malloc(sizeof(int) * nump);
    p->nsend = malloc(sizeof(int) * nump);
    p->nrecv = malloc(sizeof(int) * nump);

    if (!device) {
#ifdef ENABLE_HOST
        p->plans_short_forward = malloc(sizeof(plan_t) * ndim);
        p->plans_long_forward = malloc(sizeof(plan_t) * ndim);
        p->plans_short_inverse = malloc(sizeof(plan_t) * ndim);
        p->plans_long_inverse = malloc(sizeof(plan_t) * ndim);
#endif
    }

    /* local problem size */
    int size_in = 1;
    int size_out = 1;

    /* since we expect column-major input, the leading dimension has no
     * embedding */
    p->inembed[0] = gdim[0] / pdim[0];
    p->oembed[0] = gdim[0] / pdim[0];

    for (i = 0; i < ndim; ++i) {
        size_in *= p->inembed[i];
        size_out *= p->oembed[i];
    }

    p->size_in = size_in;
    p->size_out = size_out;

    /* find length k0 of last stage of butterflies: divide the global size
     * by the local length until it no longer exceeds the local length */
    p->k0 = malloc(sizeof(int) * ndim);

    for (i = 0; i < ndim; ++i) {
        int length = gdim[i] / pdim[i];
        if (length > 1) {
            int c = gdim[i];
            while (c > length)
                c /= length;
            p->k0[i] = c;
        } else {
            p->k0[i] = 1;
        }
    }

    /* bit-reversal tables for each dimension */
    p->rho_L = malloc(ndim * sizeof(int *));
    p->rho_pk0 = malloc(ndim * sizeof(int *));
    p->rho_Lk0 = malloc(ndim * sizeof(int *));

    for (i = 0; i < ndim; ++i) {
        int length = gdim[i] / pdim[i];
        p->rho_L[i] = malloc(sizeof(int) * length);
        p->rho_pk0[i] = malloc(sizeof(int) * pdim[i] / (p->k0[i]));
        p->rho_Lk0[i] = malloc(sizeof(int) * length / (p->k0[i]));
        bitrev_init(length, p->rho_L[i]);
        bitrev_init(pdim[i] / (p->k0[i]), p->rho_pk0[i]);
        bitrev_init(length / (p->k0[i]), p->rho_Lk0[i]);
    }

    /* processor coordinates */
    for (i = 0; i < ndim; ++i) {
        p->pidx[i] = pidx[i];
    }

    p->dfft_multi = 0;
    p->device = device;

    if (device) {
        /* use multidimensional local transforms */
        dfft_create_execution_flow(p);

        /* allocate storage for variables */
        int dmax = p->max_depth + 2;
        p->rev_j1 = malloc(sizeof(int *) * dmax);
        p->rev_global = malloc(sizeof(int *) * dmax);
        p->rev_partial = malloc(sizeof(int *) * dmax);
        p->c0 = malloc(sizeof(int *) * dmax);
        p->c1 = malloc(sizeof(int *) * dmax);

        int d;
        for (d = 0; d < dmax; ++d) {
            p->rev_j1[d] = malloc(sizeof(int) * ndim);
            p->rev_global[d] = malloc(sizeof(int) * ndim);
            p->rev_partial[d] = malloc(sizeof(int) * ndim);
            p->c0[d] = malloc(sizeof(int) * ndim);
            p->c1[d] = malloc(sizeof(int) * ndim);
        }

        p->dfft_multi = 1;
    } else {
#ifdef ENABLE_HOST
        /* running local size: starts at the input size and has inembed[i]
         * swapped for oembed[i] after each dimension */
        int size = size_in;

        for (i = 0; i < ndim; ++i) {
            int st = size / p->inembed[i] * (gdim[i] / pdim[i]);
            int length = gdim[i] / pdim[i];
            int short_stride = st / (p->k0[i]);
            int long_stride = st / length;
            int howmany = 1;

            /* plan for short-distance butterflies */
#ifdef FFT1D_SUPPORTS_THREADS
            howmany = short_stride;
#endif
            dfft_create_1d_plan(&(p->plans_short_forward[i]), p->k0[i],
                                howmany, short_stride, 1, short_stride, 1, 0);
            dfft_create_1d_plan(&(p->plans_short_inverse[i]), p->k0[i],
                                howmany, short_stride, 1, short_stride, 1, 1);

            /* plan for long-distance butterflies */
#ifdef FFT1D_SUPPORTS_THREADS
            howmany = long_stride;
#endif
            dfft_create_1d_plan(&(p->plans_long_forward[i]), length,
                                howmany, long_stride, 1, long_stride, 1, 0);
            dfft_create_1d_plan(&(p->plans_long_inverse[i]), length,
                                howmany, long_stride, 1, long_stride, 1, 1);

            size /= p->inembed[i];
            size *= p->oembed[i];
        }
#endif
    }

    /* Allocate scratch space: per dimension, the larger of the input and
     * output embedding */
    int scratch_size = 1;
    for (i = 0; i < ndim; ++i)
        scratch_size *= ((p->inembed[i] > p->oembed[i]) ? p->inembed[i] : p->oembed[i]);
    p->scratch_size = scratch_size;

    if (!device) {
#ifdef ENABLE_HOST
        dfft_allocate_aligned_memory(&(p->scratch), sizeof(cpx_t) * scratch_size);
        dfft_allocate_aligned_memory(&(p->scratch_2), sizeof(cpx_t) * scratch_size);
        dfft_allocate_aligned_memory(&(p->scratch_3), sizeof(cpx_t) * scratch_size);
#endif
    } else {
#ifdef ENABLE_CUDA
        dfft_cuda_allocate_aligned_memory(&(p->d_scratch), sizeof(cuda_cpx_t) * scratch_size);
        dfft_cuda_allocate_aligned_memory(&(p->d_scratch_2), sizeof(cuda_cpx_t) * scratch_size);
        dfft_cuda_allocate_aligned_memory(&(p->d_scratch_3), sizeof(cuda_cpx_t) * scratch_size);
#endif
    }

    p->input_cyclic = input_cyclic;
    p->output_cyclic = output_cyclic;

    /* CUDA error checking is on in debug builds only */
#ifdef ENABLE_CUDA
#ifndef NDEBUG
    p->check_cuda_errors = 1;
#else
    p->check_cuda_errors = 0;
#endif
#endif

    p->row_m = row_m;

    /* before plan creation is complete, an initialization run will
     * be performed */
    p->init = 1;

    return 0;
}