/*
 * Modular exponentiation: returns n^m reduced modulo p.
 * Callers must pass m >= 0 and p > 0.
 *
 * Implemented as recursive square-and-multiply: an even exponent is
 * halved and the partial result squared, an odd exponent has one factor
 * of n peeled off.
 */
static int power_mod(int n, int m, int p)
{
     int half;

     if (m == 0)
          return 1;        /* empty product */

     if (m % 2 != 0) {
          /* odd exponent: n * n^(m-1) */
          return MULMOD(n, power_mod(n, m - 1, p), p);
     }

     /* even exponent: (n^(m/2))^2 */
     half = power_mod(n, m / 2, p);
     return MULMOD(half, half, p);
}
/*
 * Modular exponentiation: returns n^m reduced modulo p.
 * p must be positive (checked with A()); m is expected to be >= 0.
 *
 * Implemented as recursive square-and-multiply: an even exponent is
 * halved and the partial result squared, an odd exponent has one factor
 * of n peeled off.
 */
INT X(power_mod)(INT n, INT m, INT p)
{
     INT half;

     A(p > 0);

     if (m == 0)
          return 1;        /* empty product */

     if (m % 2 != 0) {
          /* odd exponent: n * n^(m-1) */
          return MULMOD(n, X(power_mod)(n, m - 1, p), p);
     }

     /* even exponent: (n^(m/2))^2 */
     half = X(power_mod)(n, m / 2, p);
     return MULMOD(half, half, p);
}
/*
 * Forward transform, sign = -1; transform length = 3 * 2^n.
 *
 * The array a[0..n-1] is handled in place as a matrix with R = 3 rows and
 * C = n/3 columns:
 *   1. a size-three transform is applied to every column,
 *   2. rows 1 and 2 are multiplied by weights derived from the kernel,
 *   3. six_step_fnt() is run on each row of length C.
 *
 * Returns 1 on success, 0 as soon as six_step_fnt() fails for a row.
 * The final transpose is compiled out, so the result is left unordered.
 */
int
four_step_fnt(mpd_uint_t *a, mpd_size_t n, int modnum)
{
    mpd_size_t R = 3; /* number of rows */
    mpd_size_t C = n / 3; /* number of columns */
    mpd_uint_t w3table[3];
    mpd_uint_t kernel, w0, w1, wstep;
    mpd_uint_t *s, *p0, *p1, *p2;
    /* NOTE(review): umod (and dmod/dinvmod under PPRO) are not referenced
       by name below; presumably they are set by SETMODULUS and consumed by
       the MULMOD-family macros -- confirm against the macro definitions. */
    mpd_uint_t umod;
#ifdef PPRO
    double dmod;
    uint32_t dinvmod[3];
#endif
    mpd_size_t i, k;


    assert(n >= 48);
    assert(n <= 3*MPD_MAXTRANSFORM_2N);


    SETMODULUS(modnum);
    _mpd_init_w3table(w3table, -1, modnum);
    /* size three ntt on the columns: p0, p1, p2 walk the three rows in step */
    for (p0=a, p1=p0+C, p2=p0+2*C; p0<a+C; p0++,p1++,p2++) {

        SIZE3_NTT(p0, p1, p2, w3table);
    }


    kernel = _mpd_getkernel(n, -1, modnum);
    /* Row 0 is skipped.  Each pass of the inner loop handles two adjacent
       columns (k, k+1); presumably element (i, k) ends up multiplied by
       kernel**(i*k) -- confirm against MULMOD2/MULMOD2C. */
    for (i = 1; i < R; i++) {
        w0 = 1;                  /* weight for column k */
        w1 = POWMOD(kernel, i);  /* weight for column k+1 */
        wstep = MULMOD(w1, w1);  /* step applied to both weights per pass */
        for (k = 0; k < C-1; k += 2) {
            mpd_uint_t x0 = a[i*C+k];
            mpd_uint_t x1 = a[i*C+k+1];
            MULMOD2(&x0, w0, &x1, w1);
            MULMOD2C(&w0, &w1, wstep);
            a[i*C+k] = x0;
            a[i*C+k+1] = x1;
        }
    }

    /* transform rows */
    for (s = a; s < a+n; s += C) {
        if (!six_step_fnt(s, C, modnum)) {
            return 0;
        }
    }

#if 0
    /* An unordered transform is sufficient for convolution. */
    if (ordered) {
        transpose_3xpow2(a, R, C);
    }
#endif

    return 1;
}
/*
 * Return the omega table for a Rader DFT of size n with inverse generator
 * ginv.
 *
 * The table is looked up in the `omegas` list first.  On a miss, 2*(n-1)
 * reals are allocated and filled with interleaved (re, im) pairs: entry i
 * holds the value produced by the trig generator for ginv^i mod n, divided
 * by n-1, with the imaginary part multiplied by FFT_SIGN.  The plan p_ is
 * then applied to the table in place and the result is inserted into
 * `omegas` before being returned.
 */
static R *mkomega(enum wakefulness wakefulness, plan *p_, INT n, INT ginv)
{
     plan_dft *cld = (plan_dft *) p_;
     triggen *gen;
     trigreal norm;
     INT k, gp;
     R *tab;

     tab = X(rader_tl_find)(n, n, ginv, omegas);
     if (tab)
          return tab;

     tab = (R *)MALLOC(sizeof(R) * (n - 1) * 2, TWIDDLES);

     norm = n - 1.0; /* normalization for convolution */

     gen = X(mktriggen)(wakefulness, n);
     gp = 1;
     for (k = 0; k < n-1; ++k) {
          trigreal cs[2];

          gen->cexpl(gen, gp, cs);
          tab[2*k] = cs[0] / norm;
          tab[2*k+1] = FFT_SIGN * cs[1] / norm;

          gp = MULMOD(gp, ginv, n);   /* next power of ginv */
     }
     X(triggen_destroy)(gen);
     A(gp == 1);   /* the powers of ginv must have wrapped around to 1 */

     /* transform the table in place */
     cld->apply(p_, tab, tab + 1, tab, tab + 1);

     X(rader_tl_insert)(n, n, ginv, tab, &omegas);
     return tab;
}
/*
 * Backward transform, sign = 1; transform length = 3 * 2**n.
 *
 * The array a[0..n-1] is handled in place as a matrix with R = 3 rows and
 * C = n/3 columns:
 *   1. inv_six_step_fnt() is run on each row of length C,
 *   2. rows 1 and 2 are multiplied by weights derived from the kernel,
 *   3. a size-three transform is applied to every column.
 *
 * Returns 1 on success, 0 as soon as inv_six_step_fnt() fails for a row.
 * The initial transpose is compiled out, so the input is taken unordered.
 */
int
inv_four_step_fnt(const mpd_context_t *ctx, mpd_uint_t *a, mpd_size_t n, int modnum)
{
    mpd_size_t R = 3; /* number of rows */
    mpd_size_t C = n / 3; /* number of columns */
    mpd_uint_t w3table[3];
    mpd_uint_t kernel, w0, w1, wstep;
    mpd_uint_t *s, *p0, *p1, *p2;
    /* NOTE(review): umod (and dmod/dinvmod under PPRO) are not referenced
       by name below; presumably they are set by SETMODULUS and consumed by
       the MULMOD-family macros -- confirm against the macro definitions. */
    mpd_uint_t umod;
#ifdef PPRO
    double dmod;
    uint32_t dinvmod[3];
#endif
    mpd_size_t i, k;


    assert(n >= 48);
    assert(n <= 3*MPD_MAXTRANSFORM_2N);


#if 0
    /* An unordered transform is sufficient for convolution. */
    /* Transpose the matrix, producing an R*C matrix. */
    transpose_3xpow2(a, C, R);
#endif

    /* Length C transform on the rows. */
    for (s = a; s < a+n; s += C) {
        if (!inv_six_step_fnt(ctx, s, C, modnum)) {
            return 0;
        }
    }

    /* Multiply each matrix element (addressed by i*C+k) by r**(i*k). */
    SETMODULUS(modnum);
    kernel = _mpd_getkernel(n, 1, modnum);
    /* Row 0 is skipped; each inner pass handles columns k and k+1. */
    for (i = 1; i < R; i++) {
        w0 = 1;                  /* weight for column k */
        w1 = POWMOD(kernel, i);  /* weight for column k+1 */
        wstep = MULMOD(w1, w1);  /* step applied to both weights per pass */
        for (k = 0; k < C; k += 2) {
            mpd_uint_t x0 = a[i*C+k];
            mpd_uint_t x1 = a[i*C+k+1];
            MULMOD2(&x0, w0, &x1, w1);
            MULMOD2C(&w0, &w1, wstep);
            a[i*C+k] = x0;
            a[i*C+k+1] = x1;
        }
    }

    /* Length R transform on the columns: p0, p1, p2 walk the rows in step. */
    _mpd_init_w3table(w3table, 1, modnum);
    for (p0=a, p1=p0+C, p2=p0+2*C; p0<a+C; p0++,p1++,p2++) {

        SIZE3_NTT(p0, p1, p2, w3table);
    }

    return 1;
}
/*
 * In-place size-three transform of the values behind x1, x2 and x3.
 *
 * With a = *x1, b = *x2, c = *x3 on entry, the outputs are
 *     *x1 = a + b + c
 *     *x2 = a + b*w3table[1] + c*w3table[2]
 *     *x3 = a + b*w3table[2] + c*w3table[1]
 * where additions go through addmod(..., umod) and products through MULMOD.
 * All three inputs are read before any output is stored.
 */
static inline void
std_size3_ntt(mpd_uint_t *x1, mpd_uint_t *x2, mpd_uint_t *x3,
              mpd_uint_t w3table[3], mpd_uint_t umod)
{
    mpd_uint_t a = *x1;
    mpd_uint_t b = *x2;
    mpd_uint_t c = *x3;
    mpd_uint_t wa = w3table[1];
    mpd_uint_t wb = w3table[2];
    mpd_uint_t out0, out1, out2;
    mpd_uint_t prod;

    /* k = 0 -> every weight is 1 */
    out0 = addmod(a, b, umod);
    out0 = addmod(out0, c, umod);

    /* k = 1 */
    prod = MULMOD(b, wa);
    out1 = addmod(a, prod, umod);
    prod = MULMOD(c, wb);
    out1 = addmod(out1, prod, umod);

    /* k = 2: same as k = 1 with the two weights swapped */
    prod = MULMOD(b, wb);
    out2 = addmod(a, prod, umod);
    prod = MULMOD(c, wa);
    out2 = addmod(out2, prod, umod);

    /* store order matches the original: x3, then x2, then x1 */
    *x3 = out2;
    *x2 = out1;
    *x1 = out0;
}
/*
 * Find the period of n in the multiplicative group mod p (p prime).
 * That is, return the smallest m such that n^m == 1 mod p.
 *
 * If a power of n becomes 0 mod p, or if no power of n has reached 1
 * after p multiplications (which cannot happen for prime p, but would
 * otherwise make the search loop forever, e.g. n = 2, p = 6), the
 * function calls fftw_die() instead of returning.
 */
static int period(int n, int p)
{
     int prod = n;
     int count = 1;   /* exponent of n currently held in prod */

     while (prod != 1) {
          prod = MULMOD(prod, n, p);
          ++count;

          /*
           * prod == 0: n shares a factor with p, so 1 is unreachable.
           * count >= p without reaching 1: the powers of n are cycling
           * without ever hitting 1; bail out rather than spin.
           */
          if (prod == 0 || (prod != 1 && count >= p))
               fftw_die("non-prime order in Rader\n");
     }
     return count;
}
/*
 * Apply routine for the DIT variant.
 *
 * First the child plan (ego_dit->cld) is run from (ri, ii) to (ro, io).
 * Then, for each of the m sub-blocks of the output, the r-1 non-DC
 * elements are gathered in the order given by successive powers of g,
 * multiplied by the matching complex twiddle from W, stored in a scratch
 * buffer, and handed to apply_aux() together with the block's DC element.
 */
static void apply_dit(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P_dit *dit = (const P_dit *) ego_;
     const P *base = (const P *) ego_;
     plan *fwd, *bwd;
     const R *conv, *tw;
     R *scratch;
     int radix, count, gen, gen_inv;
     int step, stride;
     int blk, idx, gp;

     {
          plan *child = dit->cld;
          plan_dft *child_dft = (plan_dft *) child;
          child_dft->apply(child, ri, ii, ro, io);
     }

     fwd = base->cld1;
     bwd = base->cld2;
     radix = base->n;
     count = dit->m;
     gen = base->g;
     gen_inv = base->ginv;
     conv = base->omega;
     tw = dit->W;
     step = dit->os;      /* distance between consecutive sub-blocks */
     stride = base->os;   /* distance between elements of one sub-block */

     scratch = (R *) MALLOC(sizeof(R) * (radix - 1) * 2, BUFFERS);

     gp = 1;
     blk = 0;
     while (blk < count) {
          /* permute the input and multiply by the twiddles into scratch */
          A(gp == 1);
          for (idx = 0; idx < radix - 1; ++idx) {
               E rA = ro[gp * stride];
               E iA = io[gp * stride];
               E rW = tw[2*idx];
               E iW = tw[2*idx+1];

               scratch[2*idx] = rW * rA - iW * iA;
               scratch[2*idx + 1] = rW * iA + iW * rA;

               gp = MULMOD(gp, gen, radix);
          }
          /* here gp == g^(r-1) mod r == 1 again */

          apply_aux(radix, gen_inv, fwd, bwd, conv, scratch,
                    ro[0], io[0], ro, io, stride);

          ++blk;
          ro += step;
          io += step;
          tw += 2*(radix - 1);
     }

     X(ifree)(scratch);
}
/*
 * Shared back end of the Rader apply routines.
 *
 * buf holds r-1 interleaved complex inputs (already permuted); r0/i0 is
 * the input's element 0.  The routine
 *   - runs cld1 from buf into output slots 1..r-1,
 *   - writes the DC output as (r0, i0) plus output slot 1,
 *   - multiplies slots 1..r-1 by omega and negates the imaginary part,
 *   - folds (r0, -i0) into slot 1,
 *   - runs cld2 from the output slots back into buf,
 *   - scatters buf to the output in the order given by powers of ginv,
 *     negating the imaginary part again.
 */
static void apply_aux(int r, int ginv, plan *cld1,plan *cld2, const R *omega,
                      R *buf, R r0, R i0, R *ro, R *io, int os)
{
     int k, gp;

     /* compute DFT of buf, storing in output (except DC): */
     {
          plan_dft *fwd = (plan_dft *) cld1;
          fwd->apply(cld1, buf, buf+1, ro+os, io+os);
     }

     /* set output DC component: */
     ro[0] = r0 + ro[os];
     io[0] = i0 + io[os];

     /* now, multiply by omega: */
     for (k = 0; k < r - 1; ++k) {
          const int at = (k+1)*os;
          E rW = omega[2*k];
          E iW = omega[2*k+1];
          E rB = ro[at];
          E iB = io[at];

          ro[at] = rW * rB - iW * iB;
          io[at] = -(rW * iB + iW * rB);
     }

     /* this will add input[0] to all of the outputs after the ifft */
     ro[os] += r0;
     io[os] -= i0;

     /* inverse FFT: */
     {
          plan_dft *bwd = (plan_dft *) cld2;
          bwd->apply(cld2, ro+os, io+os, buf, buf+1);
     }

     /* finally, do inverse permutation to unshuffle the output: */
     gp = 1;
     k = 0;
     while (k < r - 1) {
          ro[gp * os] = buf[2*k];
          io[gp * os] = -buf[2*k+1];
          gp = MULMOD(gp, ginv, r);
          ++k;
     }
     A(gp == 1);
}
/*
 * Initialize transform parameters.
 *
 * Allocates a struct fnt_params with room for n/2 table entries and fills
 * in the modulus number, the modulus, the kernel obtained for (n, sign),
 * the kernel obtained for (4, -sign) as `imag`, and a table whose entry i
 * is the i-th power of the kernel (entry 0 is 1).
 *
 * n must be a power of two, sign must be -1 or 1 and modnum must lie in
 * [P1, P3] (all checked with assert).  Returns NULL if the allocation
 * fails.
 */
struct fnt_params *
_mpd_init_fnt_params(mpd_size_t n, int sign, int modnum)
{
    struct fnt_params *params;
    mpd_uint_t umod;
#ifdef PPRO
    double dmod;
    uint32_t dinvmod[3];
#endif
    mpd_uint_t root, quarter_root, power;
    mpd_uint_t slot;
    mpd_size_t half;


    assert(ispower2(n));
    assert(sign == -1 || sign == 1);
    assert(P1 <= modnum && modnum <= P3);

    half = n/2;
    params = mpd_sh_alloc(sizeof *params, half, sizeof (mpd_uint_t));
    if (params == NULL) {
        return NULL;
    }

    SETMODULUS(modnum);
    root = _mpd_getkernel(n, sign, modnum);
    quarter_root = _mpd_getkernel(4, -sign, modnum);

    params->modnum = modnum;
    params->modulus = umod;
    params->imag = quarter_root;
    params->kernel = root;

    /* wtable[slot] = root**slot */
    power = 1;
    slot = 0;
    while (slot < half) {
        params->wtable[slot] = power;
        power = MULMOD(power, root);
        slot++;
    }

    return params;
}
/*
 * Return the omega table for a real-data Rader transform of prime size n,
 * convolution length npad and inverse generator ginv.
 *
 * The table is looked up in the `omegas` list under the key
 * (n, npad + 1, ginv).  On a miss, npad reals are allocated; entry i
 * (i < n-1) is the sum of the two trig-generator outputs for ginv^i mod n,
 * divided by npad.  The remaining entries are zeroed, and when npad is
 * larger than n-1 the first n-2 non-leading entries are mirrored to the
 * tail.  The plan p_ is applied to the table in place and the table is
 * inserted into `omegas` before being returned.
 */
static R *mkomega(enum wakefulness wakefulness,
                  plan *p_, INT n, INT npad, INT ginv)
{
     plan_rdft *cld = (plan_rdft *) p_;
     triggen *gen;
     trigreal norm;
     INT k, gp;
     R *tab;

     tab = X(rader_tl_find)(n, npad + 1, ginv, omegas);
     if (tab)
          return tab;

     tab = (R *)MALLOC(sizeof(R) * npad, TWIDDLES);

     norm = npad; /* normalization for convolution */

     gen = X(mktriggen)(wakefulness, n);
     gp = 1;
     k = 0;
     while (k < n-1) {
          trigreal cs[2];

          gen->cexpl(gen, gp, cs);
          tab[k] = (cs[0] + cs[1]) / norm;

          gp = MULMOD(gp, ginv, n);
          ++k;
     }
     X(triggen_destroy)(gen);
     A(gp == 1);

     A(npad == n - 1 || npad >= 2*(n - 1) - 1);

     /* zero everything past the n-1 computed entries (k carries over) */
     while (k < npad) {
          tab[k] = K(0.0);
          ++k;
     }

     /* padded case: mirror entries 1..n-2 to the end of the table */
     if (npad > n - 1) {
          for (k = 1; k < n-1; ++k)
               tab[npad - k] = tab[n - 1 - k];
     }

     cld->apply(p_, tab, tab);

     X(rader_tl_insert)(n, npad + 1, ginv, tab, &omegas);
     return tab;
}
static R *mktwiddle(int m, int r, int g) { int i, j, gpower; int n = r * m; R *W; if ((W = X(rader_tl_find)(m, r, g, twiddles)) != 0 ) return W; W = (R *)MALLOC(sizeof(R) * (r - 1) * m * 2, TWIDDLES); for (i = 0; i < m; ++i) { for (gpower = 1, j = 0; j < r - 1; ++j, gpower = MULMOD(gpower, g, r)) { int k = i * (r - 1) + j; W[2*k] = X(cos2pi)(i * gpower, n); W[2*k+1] = FFT_SIGN * X(sin2pi)(i * gpower, n); } A(gpower == 1); } X(rader_tl_insert)(m, r, g, W, &twiddles); return W; }
/*
 * Wake up or put to sleep the plan and its three child plans.
 *
 * The children are always forwarded the new wakefulness.  For SLEEPY the
 * omega table is released and the pointer cleared.  For any other state
 * the generator and its inverse are recomputed from n and a new omega
 * table is obtained from mkomega(); omega must be null at that point.
 */
static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *self = (P *) ego_;

     X(plan_awake)(self->cld1, wakefulness);
     X(plan_awake)(self->cld2, wakefulness);
     X(plan_awake)(self->cld_omega, wakefulness);

     if (wakefulness == SLEEPY) {
          free_omega(self->omega);
          self->omega = 0;
          return;
     }

     self->g = X(find_generator)(self->n);
     /* ginv = g^(n-2) mod n, checked below to be the inverse of g */
     self->ginv = X(power_mod)(self->g, self->n - 2, self->n);
     A(MULMOD(self->g, self->ginv, self->n) == 1);

     A(!self->omega);
     self->omega = mkomega(wakefulness,
                           self->cld_omega, self->n, self->npad, self->ginv);
}
/*
 * Apply routine: gathers the r-1 non-DC inputs in the order given by
 * successive powers of the generator g into a temporary interleaved
 * buffer, then lets apply_aux() do the transforms and write the output.
 * The temporary buffer is allocated and freed on every call.
 */
static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *self = (const P *) ego_;
     const int radix = self->n;
     const int in_stride = self->is;
     const int gen = self->g;
     R *scratch, *dst;
     int k, gp;

     scratch = (R *) MALLOC(sizeof(R) * (radix - 1) * 2, BUFFERS);

     /* First, permute the input, storing in scratch: */
     dst = scratch;
     gp = 1;
     for (k = 0; k < radix - 1; ++k) {
          R re = ri[gp * in_stride];
          R im = ii[gp * in_stride];

          *dst++ = re;
          *dst++ = im;

          gp = MULMOD(gp, gen, radix);
     }
     /* gp == g^(r-1) mod r == 1 */

     apply_aux(radix, self->ginv, self->cld1, self->cld2, self->omega,
               scratch, ri[0], ii[0], ro, io, self->os);

     X(ifree)(scratch);
}
/*
 * Twiddle pass of size r using Rader's algorithm.
 *
 * For each of the m positions (A advances by `stride`, W by r-1 per
 * position) the r elements A[k * m * stride] are transformed in place:
 * the non-DC inputs are permuted by powers of d->g and multiplied by W,
 * transformed with a length r-1 FFT, multiplied by d->omega, transformed
 * again, and written back in the order given by powers of d->ginv.
 * A scratch array of r-1 complex values is allocated on entry.
 */
void fftw_twiddle_rader(fftw_complex *A, const fftw_complex *W, int m, int r, int stride, fftw_rader_data * d)
{
     fftw_complex *tmp = (fftw_complex *) fftw_malloc((r - 1) * sizeof(fftw_complex));
     int i, k, gpower = 1, g = d->g, ginv = d->ginv;
     fftw_real a0r, a0i;
     fftw_complex *omega = d->omega;

     for (i = 0; i < m; ++i, A += stride, W += r - 1) {
          /*
           * Here, we fft W[k-1] * A[k*(m*stride)], using Rader.
           * (Actually, W is pre-permuted to match the permutation that we
           * will do on A.)
           */

          /* First, permute the input and multiply by W, storing in tmp: */
          /* gpower == g^k mod r in the following loop */
          for (k = 0; k < r - 1; ++k, gpower = MULMOD(gpower, g, r)) {
               fftw_real rA, iA, rW, iW;
               rW = c_re(W[k]);
               iW = c_im(W[k]);
               rA = c_re(A[gpower * (m * stride)]);
               iA = c_im(A[gpower * (m * stride)]);
               /* complex product W[k] * A[...] */
               c_re(tmp[k]) = rW * rA - iW * iA;
               c_im(tmp[k]) = rW * iA + iW * rA;
          }

          /* after r-1 steps the powers of g must be back at 1 */
          WHEN_DEBUG( { if (gpower != 1) fftw_die("incorrect generator in Rader\n"); } );

          /* FFT tmp to A: */
          fftw_executor_simple(r - 1, tmp, A + (m * stride), d->plan->root, 1, m * stride, d->plan->recurse_kind);

          /* set output DC component (the old A[0] is kept in a0r/a0i): */
          a0r = c_re(A[0]);
          a0i = c_im(A[0]);
          c_re(A[0]) += c_re(A[(m * stride)]);
          c_im(A[0]) += c_im(A[(m * stride)]);

          /* now, multiply by omega (and negate the imaginary part): */
          for (k = 0; k < r - 1; ++k) {
               fftw_real rA, iA, rW, iW;
               rW = c_re(omega[k]);
               iW = c_im(omega[k]);
               rA = c_re(A[(k + 1) * (m * stride)]);
               iA = c_im(A[(k + 1) * (m * stride)]);
               c_re(A[(k + 1) * (m * stride)]) = rW * rA - iW * iA;
               c_im(A[(k + 1) * (m * stride)]) = -(rW * iA + iW * rA);
          }

          /* this will add A[0] to all of the outputs after the ifft */
          c_re(A[(m * stride)]) += a0r;
          c_im(A[(m * stride)]) -= a0i;

          /* inverse FFT: */
          fftw_executor_simple(r - 1, A + (m * stride), tmp, d->plan->root, m * stride, 1, d->plan->recurse_kind);

          /* finally, do inverse permutation to unshuffle the output: */
          for (k = 0; k < r - 1; ++k, gpower = MULMOD(gpower, ginv, r)) {
               c_re(A[gpower * (m * stride)]) = c_re(tmp[k]);
               c_im(A[gpower * (m * stride)]) = -c_im(tmp[k]);
          }
          /* the powers of ginv must likewise have wrapped around to 1 */
          WHEN_DEBUG( { if (gpower != 1) fftw_die("incorrect generator in Rader\n"); } );
/*
 * Forward transform with sign = -1.
 *
 * The array a[0..n-1] (n a power of two, 16 <= n <= MPD_MAXTRANSFORM_2N,
 * checked with assert) is handled in place as a matrix with
 * C = 2**(log2n / 2) columns and R = 2**(log2n - log2n / 2) rows:
 * transpose, length R transforms, transpose back, multiply by kernel
 * powers, length C transforms.  The final transpose is compiled out.
 *
 * Returns 1 on success and 0 if a transpose or the allocation of the
 * transform parameters fails; tparams is freed on every path that
 * allocated it.
 */
int
six_step_fnt(mpd_uint_t *a, mpd_size_t n, int modnum)
{
    struct fnt_params *tparams;
    mpd_size_t log2n, C, R;
    mpd_uint_t kernel;
    /* NOTE(review): umod (and dmod/dinvmod under PPRO) are not referenced
       by name below; presumably they are set by SETMODULUS and consumed by
       the MULMOD-family macros -- confirm against the macro definitions. */
    mpd_uint_t umod;
#ifdef PPRO
    double dmod;
    uint32_t dinvmod[3];
#endif
    mpd_uint_t *x, w0, w1, wstep;
    mpd_size_t i, k;


    assert(ispower2(n));
    assert(n >= 16);
    assert(n <= MPD_MAXTRANSFORM_2N);

    log2n = mpd_bsr(n);
    C = ((mpd_size_t)1) << (log2n / 2);  /* number of columns */
    R = ((mpd_size_t)1) << (log2n - (log2n / 2)); /* number of rows */


    /* Transpose the matrix. */
    if (!transpose_pow2(a, R, C)) {
        return 0;
    }

    /* Length R transform on the rows. */
    if ((tparams = _mpd_init_fnt_params(R, -1, modnum)) == NULL) {
        return 0;
    }
    for (x = a; x < a+n; x += R) {
        fnt_dif2(x, R, tparams);
    }

    /* Transpose the matrix. */
    if (!transpose_pow2(a, C, R)) {
        mpd_free(tparams);
        return 0;
    }

    /* Multiply each matrix element (addressed by i*C+k) by r**(i*k). */
    SETMODULUS(modnum);
    kernel = _mpd_getkernel(n, -1, modnum);
    /* Row 0 is skipped; each inner pass handles columns k and k+1. */
    for (i = 1; i < R; i++) {
        w0 = 1;                  /* r**(i*0): initial value for k=0 */
        w1 = POWMOD(kernel, i);  /* r**(i*1): initial value for k=1 */
        wstep = MULMOD(w1, w1);  /* r**(2*i) */
        for (k = 0; k < C; k += 2) {
            mpd_uint_t x0 = a[i*C+k];
            mpd_uint_t x1 = a[i*C+k+1];
            MULMOD2(&x0, w0, &x1, w1);
            MULMOD2C(&w0, &w1, wstep);  /* r**(i*(k+2)) = r**(i*k) * r**(2*i) */
            a[i*C+k] = x0;
            a[i*C+k+1] = x1;
        }
    }

    /* Length C transform on the rows.  The parameters built for length R
       are reused when C == R and rebuilt otherwise. */
    if (C != R) {
        mpd_free(tparams);
        if ((tparams = _mpd_init_fnt_params(C, -1, modnum)) == NULL) {
            return 0;
        }
    }
    for (x = a; x < a+n; x += C) {
        fnt_dif2(x, C, tparams);
    }
    mpd_free(tparams);

#if 0
    /* An unordered transform is sufficient for convolution. */
    /* Transpose the matrix. */
    if (!transpose_pow2(a, R, C)) {
        return 0;
    }
#endif

    return 1;
}
/*
 * Apply routine for the Rader DFT of size r = ego->n.
 *
 * Input element 0 is read up front.  The remaining r-1 inputs are
 * gathered in the order given by powers of g into a temporary interleaved
 * buffer, transformed with cld1 into output slots 1..r-1, multiplied by
 * omega (with the imaginary part negated), transformed back into the
 * buffer with cld2, and finally scattered to the output in the order
 * given by powers of ginv (imaginary part negated again).  The DC output
 * is input element 0 plus the first result of the forward transform.
 */
static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
{
     const P *self = (const P *) ego_;
     R r0 = ri[0], i0 = ii[0];   /* read before any output is written */
     const INT radix = self->n;
     const INT in_stride = self->is;
     const INT out_stride = self->os;
     const INT gen = self->g;
     INT k, gp;
     R *scratch;

     scratch = (R *) MALLOC(sizeof(R) * (radix - 1) * 2, BUFFERS);

     /* First, permute the input, storing in scratch: */
     gp = 1;
     for (k = 0; k < radix - 1; ++k) {
          R re = ri[gp * in_stride];
          R im = ii[gp * in_stride];

          scratch[2*k] = re;
          scratch[2*k + 1] = im;

          gp = MULMOD(gp, gen, radix);
     }
     /* gp == g^(r-1) mod r == 1 */

     /* compute DFT of scratch, storing in output (except DC): */
     {
          plan_dft *fwd = (plan_dft *) self->cld1;
          fwd->apply(self->cld1, scratch, scratch+1,
                     ro+out_stride, io+out_stride);
     }

     /* set output DC component: */
     ro[0] = r0 + ro[out_stride];
     io[0] = i0 + io[out_stride];

     /* now, multiply by omega: */
     {
          const R *conv = self->omega;

          for (k = 0; k < radix - 1; ++k) {
               const INT at = (k+1)*out_stride;
               E rW = conv[2*k];
               E iW = conv[2*k+1];
               E rB = ro[at];
               E iB = io[at];

               ro[at] = rW * rB - iW * iB;
               io[at] = -(rW * iB + iW * rB);
          }
     }

     /* this will add input[0] to all of the outputs after the ifft */
     ro[out_stride] += r0;
     io[out_stride] -= i0;

     /* inverse FFT: */
     {
          plan_dft *bwd = (plan_dft *) self->cld2;
          bwd->apply(self->cld2, ro+out_stride, io+out_stride,
                     scratch, scratch+1);
     }

     /* finally, do inverse permutation to unshuffle the output: */
     {
          const INT gen_inv = self->ginv;

          gp = 1;
          k = 0;
          while (k < radix - 1) {
               ro[gp * out_stride] = scratch[2*k];
               io[gp * out_stride] = -scratch[2*k+1];
               gp = MULMOD(gp, gen_inv, radix);
               ++k;
          }
          A(gp == 1);
     }

     X(ifree)(scratch);
}
static int mkP(P *pln, INT n, INT is, INT os, R *ro, R *io, planner *plnr) { plan *cld1 = (plan *) 0; plan *cld2 = (plan *) 0; plan *cld_omega = (plan *) 0; R *buf = (R *) 0; /* initial allocation for the purpose of planning */ buf = (R *) MALLOC(sizeof(R) * (n - 1) * 2, BUFFERS); cld1 = X(mkplan_f_d)(plnr, X(mkproblem_dft_d)(X(mktensor_1d)(n - 1, 2, os), X(mktensor_1d)(1, 0, 0), buf, buf + 1, ro + os, io + os), NO_SLOW, 0, 0); if (!cld1) goto nada; cld2 = X(mkplan_f_d)(plnr, X(mkproblem_dft_d)(X(mktensor_1d)(n - 1, os, 2), X(mktensor_1d)(1, 0, 0), ro + os, io + os, buf, buf + 1), NO_SLOW, 0, 0); if (!cld2) goto nada; /* plan for omega array */ cld_omega = X(mkplan_f_d)(plnr, X(mkproblem_dft_d)(X(mktensor_1d)(n - 1, 2, 2), X(mktensor_1d)(1, 0, 0), buf, buf + 1, buf, buf + 1), NO_SLOW, ESTIMATE, 0); if (!cld_omega) goto nada; /* deallocate buffers; let awake() or apply() allocate them for real */ X(ifree)(buf); buf = 0; pln->cld1 = cld1; pln->cld2 = cld2; pln->cld_omega = cld_omega; pln->omega = 0; pln->n = n; pln->is = is; pln->os = os; pln->g = X(find_generator)(n); pln->ginv = X(power_mod)(pln->g, n - 2, n); A(MULMOD(pln->g, pln->ginv, n) == 1); X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops); pln->super.super.ops.other += (n - 1) * (4 * 2 + 6) + 6; pln->super.super.ops.add += (n - 1) * 2 + 4; pln->super.super.ops.mul += (n - 1) * 4; return 1; nada: X(ifree0)(buf); X(plan_destroy_internal)(cld_omega); X(plan_destroy_internal)(cld2); X(plan_destroy_internal)(cld1); return 0; }
static fftw_rader_data *create_rader_aux(int p, int flags) { fftw_complex *omega, *work; int g, ginv, gpower; int i; FFTW_TRIG_REAL twoPiOverN; fftw_real scale = 1.0 / (p - 1); /* for convolution */ fftw_plan plan; fftw_rader_data *d; if (p < 2) fftw_die("non-prime order in Rader\n"); flags &= ~FFTW_IN_PLACE; d = (fftw_rader_data *) fftw_malloc(sizeof(fftw_rader_data)); g = find_generator(p); ginv = power_mod(g, p - 2, p); omega = (fftw_complex *) fftw_malloc((p - 1) * sizeof(fftw_complex)); plan = fftw_create_plan(p - 1, FFTW_FORWARD, flags & ~FFTW_NO_VECTOR_RECURSE); work = (fftw_complex *) fftw_malloc((p - 1) * sizeof(fftw_complex)); twoPiOverN = FFTW_K2PI / (FFTW_TRIG_REAL) p; gpower = 1; for (i = 0; i < p - 1; ++i) { c_re(work[i]) = scale * FFTW_TRIG_COS(twoPiOverN * gpower); c_im(work[i]) = FFTW_FORWARD * scale * FFTW_TRIG_SIN(twoPiOverN * gpower); gpower = MULMOD(gpower, ginv, p); } /* fft permuted roots of unity */ fftw_executor_simple(p - 1, work, omega, plan->root, 1, 1, plan->recurse_kind); fftw_free(work); d->plan = plan; d->omega = omega; d->g = g; d->ginv = ginv; d->p = p; d->flags = flags; d->refcount = 1; d->next = NULL; d->cdesc = (fftw_codelet_desc *) fftw_malloc(sizeof(fftw_codelet_desc)); d->cdesc->name = NULL; d->cdesc->codelet = NULL; d->cdesc->size = p; d->cdesc->dir = FFTW_FORWARD; d->cdesc->type = FFTW_RADER; d->cdesc->signature = g; d->cdesc->ntwiddle = 0; d->cdesc->twiddle_order = NULL; return d; }
/*
 * Build the twiddle array described by the instruction list `instr` for a
 * transform of size n with radix r and m columns.
 *
 * twlen0() supplies the number of reals per instruction pass and the
 * vector length vl; m must be a multiple of vl.  For every j in
 * 0, vl, 2*vl, ... < m the instruction list is walked up to TW_NEXT and
 * each instruction appends values produced by the trig generator:
 *   TW_FULL  r-1 complex values for exponents (j+v)*i, i = 1..r-1
 *   TW_HALF  complex values for exponents i*(j+v) mod n, while 2*i < r
 *   TW_COS   first component for exponent (j+v)*i
 *   TW_SIN   second component for exponent (j+v)*i
 *   TW_CEXP  one complex value for exponent (j+v)*i
 * Returns the start of the allocated array.
 */
static R *compute(enum wakefulness wakefulness,
                  const tw_instr *instr, INT n, INT r, INT m)
{
     INT per_pass, j, vl;
     R *out, *start;
     const tw_instr *ins;
     triggen *gen = X(mktriggen)(wakefulness, n);

     per_pass = twlen0(r, instr, &vl);

     A(m % vl == 0);

     start = out = (R *)MALLOC((per_pass * (m / vl)) * sizeof(R), TWIDDLES);

     for (j = 0; j < m; j += vl) {
          for (ins = instr; ins->op != TW_NEXT; ++ins) {
               /* column index this instruction refers to */
               const INT jv = j + (INT)ins->v;

               switch (ins->op) {
                   case TW_FULL: {
                        INT i;
                        for (i = 1; i < r; ++i) {
                             A(jv * i < n);
                             A(jv * i > -n);
                             gen->cexp(gen, jv * i, out);
                             out += 2;
                        }
                        break;
                   }

                   case TW_HALF: {
                        INT i;
                        A((r % 2) == 1);
                        for (i = 1; i + i < r; ++i) {
                             gen->cexp(gen, MULMOD(i, jv, n), out);
                             out += 2;
                        }
                        break;
                   }

                   case TW_COS: {
                        R pair[2];

                        A(jv * ins->i < n);
                        A(jv * ins->i > -n);
                        gen->cexp(gen, jv * (INT)ins->i, pair);
                        *out++ = pair[0];
                        break;
                   }

                   case TW_SIN: {
                        R pair[2];

                        A(jv * ins->i < n);
                        A(jv * ins->i > -n);
                        gen->cexp(gen, jv * (INT)ins->i, pair);
                        *out++ = pair[1];
                        break;
                   }

                   case TW_CEXP:
                        A(jv * ins->i < n);
                        A(jv * ins->i > -n);
                        gen->cexp(gen, jv * (INT)ins->i, out);
                        out += 2;
                        break;
               }
          }
     }

     X(triggen_destroy)(gen);
     return start;
}
/*
 * Apply routine for the real-data Rader plan of prime size n.
 *
 * The n-1 non-DC inputs are gathered in the order given by powers of g
 * into a temporary buffer of npad reals (zero-padded when npad > n-1),
 * transformed in place with cld1, multiplied by omega, transformed in
 * place with cld2, and scattered to O in the order given by powers of
 * ginv.  O[0] is I[0] plus the first result of the first transform.
 * Which output formula is used depends on R2HC_ONLY_CONV.
 * The buffer is allocated and freed on every call.
 */
static void apply(const plan *ego_, R *I, R *O)
{
     const P *ego = (const P *) ego_;
     INT n = ego->n; /* prime */
     INT npad = ego->npad; /* == n - 1 for unpadded Rader; always even */
     INT is = ego->is, os;
     INT k, gpower, g;
     R *buf, *omega;
     R r0;

     buf = (R *) MALLOC(sizeof(R) * npad, BUFFERS);

     /* First, permute the input, storing in buf: */
     g = ego->g;
     for (gpower = 1, k = 0; k < n - 1; ++k, gpower = MULMOD(gpower, g, n)) {
          buf[k] = I[gpower * is];
     }
     /* gpower == g^(n-1) mod n == 1 */;

     A(n - 1 <= npad);
     for (k = n - 1; k < npad; ++k) /* optionally, zero-pad convolution */
          buf[k] = 0;

     os = ego->os;

     /* compute RDFT of buf, storing in buf (i.e., in-place): */
     {
          plan_rdft *cld = (plan_rdft *) ego->cld1;
          cld->apply((plan *) cld, buf, buf);
     }

     /* set output DC component (r0 keeps I[0] for later use): */
     O[0] = (r0 = I[0]) + buf[0];

     /* now, multiply by omega; buf[k] and buf[npad - k] are treated as
        the two parts of one complex value: */
     omega = ego->omega;

     buf[0] *= omega[0];
     for (k = 1; k < npad/2; ++k) {
          E rB, iB, rW, iW, a, b;
          rW = omega[k];
          iW = omega[npad - k];
          rB = buf[k];
          iB = buf[npad - k];
          a = rW * rB - iW * iB;
          b = rW * iB + iW * rB;
#if R2HC_ONLY_CONV
          buf[k] = a + b;
          buf[npad - k] = a - b;
#else
          buf[k] = a;
          buf[npad - k] = b;
#endif
     }
     /* Nyquist component (k == npad/2 after the loop): */
     A(k + k == npad); /* since npad is even */
     buf[k] *= omega[k];

     /* this will add input[0] to all of the outputs after the ifft */
     buf[0] += r0;

     /* inverse FFT: */
     {
          plan_rdft *cld = (plan_rdft *) ego->cld2;
          cld->apply((plan *) cld, buf, buf);
     }

     /* do inverse permutation to unshuffle the output: */
     A(gpower == 1);
#if R2HC_ONLY_CONV
     /* gpower starts at ginv here because the k = 0 output is written
        directly to O[os] */
     O[os] = buf[0];
     gpower = g = ego->ginv;
     A(npad == n - 1 || npad/2 >= n - 1);
     if (npad == n - 1) {
          for (k = 1; k < npad/2; ++k, gpower = MULMOD(gpower, g, n)) {
               O[gpower * os] = buf[k] + buf[npad - k];
          }
          /* middle element, k == npad/2 */
          O[gpower * os] = buf[k];
          ++k, gpower = MULMOD(gpower, g, n);
          for (; k < npad; ++k, gpower = MULMOD(gpower, g, n)) {
               O[gpower * os] = buf[npad - k] - buf[k];
          }
     }
     else {
          for (k = 1; k < n - 1; ++k, gpower = MULMOD(gpower, g, n)) {
               O[gpower * os] = buf[k] + buf[npad - k];
          }
     }
#else
     /* gpower is still 1 from the input permutation (asserted above) */
     g = ego->ginv;
     for (k = 0; k < n - 1; ++k, gpower = MULMOD(gpower, g, n)) {
          O[gpower * os] = buf[k];
     }
#endif
     A(gpower == 1);

     X(ifree)(buf);
}
/*
 * Create the real-data Rader plan for problem p_, or return a null plan
 * if the solver is not applicable or a child plan cannot be created.
 *
 * The convolution length npad is n-1, or the value returned by
 * choose_transform_size(2 * (n - 1) - 1) when the solver's `pad` flag is
 * set.  Three in-place child plans of length npad are created on a
 * temporary buffer: an R2HC plan (cld1), a second plan whose kind depends
 * on R2HC_ONLY_CONV (cld2), and an R2HC plan for the omega table
 * (cld_omega, planned with ESTIMATE).  The buffer is released before the
 * plan object is built.
 */
static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     const S *ego = (const S *) ego_;
     const problem_rdft *p = (const problem_rdft *) p_;
     P *pln;
     INT n, npad;
     INT is, os;
     plan *cld1 = (plan *) 0;
     plan *cld2 = (plan *) 0;
     plan *cld_omega = (plan *) 0;
     float *buf = (float *) 0;
     problem *cldp;

     static const plan_adt padt = {
          fftwf_rdft_solve, awake, print, destroy
     };

     if (!applicable(ego_, p_, plnr))
          return (plan *) 0;

     n = p->sz->dims[0].n;
     is = p->sz->dims[0].is;
     os = p->sz->dims[0].os;

     if (ego->pad)
          npad = choose_transform_size(2 * (n - 1) - 1);
     else
          npad = n - 1;

     /* initial allocation for the purpose of planning */
     buf = (float *) MALLOC(sizeof(float) * npad, BUFFERS);

     /* first transform: in-place R2HC on the buffer */
     cld1 = fftwf_mkplan_f_d(plnr,fftwf_mkproblem_rdft_1_d(fftwf_mktensor_1d(npad, 1, 1),
                                                           fftwf_mktensor_1d(1, 0, 0),
                                                           buf, buf,
                                                           R2HC),
                             NO_SLOW, 0, 0);
     if (!cld1) goto nada;

     /* second transform: in-place on the buffer, kind chosen at compile time */
     cldp =
          fftwf_mkproblem_rdft_1_d(
               fftwf_mktensor_1d(npad, 1, 1),
               fftwf_mktensor_1d(1, 0, 0),
               buf, buf,
#if R2HC_ONLY_CONV
               R2HC
#else
               HC2R
#endif
               );
     if (!(cld2 = fftwf_mkplan_f_d(plnr, cldp, NO_SLOW, 0, 0)))
          goto nada;

     /* plan for omega */
     cld_omega = fftwf_mkplan_f_d(plnr,
                                  fftwf_mkproblem_rdft_1_d(
                                       fftwf_mktensor_1d(npad, 1, 1),
                                       fftwf_mktensor_1d(1, 0, 0),
                                       buf, buf, R2HC),
                                  NO_SLOW, ESTIMATE, 0);
     if (!cld_omega) goto nada;

     /* deallocate buffers; let awake() or apply() allocate them for real */
     fftwf_ifree(buf);
     buf = 0;

     pln = MKPLAN_RDFT(P, &padt, apply);
     pln->cld1 = cld1;
     pln->cld2 = cld2;
     pln->cld_omega = cld_omega;
     pln->omega = 0;   /* no omega table stored yet */
     pln->n = n;
     pln->npad = npad;
     pln->is = is;
     pln->os = os;
     pln->g = fftwf_find_generator(n);
     /* ginv = g^(n-2) mod n, checked below to be the inverse of g */
     pln->ginv = fftwf_power_mod(pln->g, n - 2, n);
     A(MULMOD(pln->g, pln->ginv, n) == 1);

     /* cost = both children plus the work done directly by apply() */
     fftwf_ops_add(&cld1->ops, &cld2->ops, &pln->super.super.ops);
     pln->super.super.ops.other += (npad/2-1)*6 + npad + n + (n-1) * ego->pad;
     pln->super.super.ops.add += (npad/2-1)*2 + 2 + (n-1) * ego->pad;
     pln->super.super.ops.mul += (npad/2-1)*4 + 2 + ego->pad;
#if R2HC_ONLY_CONV
     pln->super.super.ops.other += n-2 - ego->pad;
     pln->super.super.ops.add += (npad/2-1)*2 + (n-2) - ego->pad;
#endif

     return &(pln->super.super);

 nada:
     /* release whatever was created before the failure */
     fftwf_ifree0(buf);
     fftwf_plan_destroy_internal(cld_omega);
     fftwf_plan_destroy_internal(cld2);
     fftwf_plan_destroy_internal(cld1);
     return 0;
}