void
mulmid_fft_params (unsigned* lgK, unsigned* lgM, ulong* m1, ulong* m2,
                   ulong* p, size_t n1, size_t n2)
{
   ZNP_ASSERT (n2 >= 1);
   ZNP_ASSERT (n1 >= n2);

   unsigned _lgM;
   size_t _m1;
   ulong M, _p;

   // increase lgM until all the conditions are satisfied
   for (_lgM = 1; ; _lgM++)
   {
      M = 1UL << _lgM;
      _p = ((-n2) & (M/2 - 1)) + 1;
      _m1 = CEIL_DIV_2EXP (n1 + _p, _lgM - 1);
      if (_m1 <= 2 * M)
         break;
   }

   *lgM = _lgM;
   *lgK = (_m1 > M) ? (_lgM + 1) : _lgM;
   *p = _p;
   *m1 = _m1;
   *m2 = CEIL_DIV_2EXP (n2, _lgM - 1);
}
void
zn_array_invert (ulong* res, const ulong* op, size_t n, const zn_mod_t mod)
{
   ZNP_ASSERT (n >= 1);

   // for now assume input is monic
   ZNP_ASSERT (op[0] == 1);

   if (n == 1)
   {
      res[0] = 1;
      return;
   }

   size_t half = (n + 1) / 2;    // ceil(n / 2)

   // recursively obtain the first half of the output
   zn_array_invert (res, op, half, mod);

   // extend to second half of the output
   if (mod->m & 1)
      zn_array_invert_extend_fft (res + half, res, op, half, n - half, mod);
   else
      zn_array_invert_extend (res + half, res, op, half, n - half, mod);
}
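/*
   Added illustrative sketch (not part of the original library source): checks
   the defining property of zn_array_invert(), namely f * (1/f) = 1 + O(x^n),
   using zn_array_mul() from this library. The scratch-allocation pattern
   follows the other routines in this file; the function name is hypothetical.
*/
int
check_zn_array_invert (const ulong* f, size_t n, const zn_mod_t mod)
{
   ZNP_ASSERT (n >= 1);
   ZNP_ASSERT (f[0] == 1);      // zn_array_invert() assumes a monic input

   ZNP_FASTALLOC (g, ulong, 6624, n);
   ZNP_FASTALLOC (t, ulong, 6624, 2 * n - 1);

   zn_array_invert (g, f, n, mod);      // g = 1/f + O(x^n)
   zn_array_mul (t, f, n, g, n, mod);   // t = f * g  (2n - 1 terms)

   // f * g should be 1 + O(x^n)
   int ok = (t[0] == 1);
   size_t i;
   for (i = 1; i < n; i++)
      ok = ok && (t[i] == 0);

   ZNP_FASTFREE (t);
   ZNP_FASTFREE (g);
   return ok;
}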
/*
   Helper function for ref_zn_array_unpack().

   Inverse operation of ref_zn_array_pack_helper(); each output coefficient
   occupies ceil(b / ULONG_BITS) ulongs.

   Running time is soft-linear in output length.
*/
void
ref_zn_array_unpack_helper (ulong* res, const mpz_t op, size_t n, unsigned b,
                            unsigned k)
{
   ZNP_ASSERT (n >= 1);
   ZNP_ASSERT (mpz_sizeinbase (op, 2) <= n * b + k);

   unsigned w = CEIL_DIV (b, ULONG_BITS);

   mpz_t y;
   mpz_init (y);

   if (n == 1)
   {
      // base case
      unsigned i;
      mpz_tdiv_q_2exp (y, op, k);
      for (i = 0; i < w; i++)
      {
         res[i] = mpz_get_ui (y);
         mpz_tdiv_q_2exp (y, y, ULONG_BITS);
      }
   }
   else
   {
      // recursively split into top and bottom halves
      mpz_tdiv_q_2exp (y, op, (n / 2) * b + k);
      ref_zn_array_unpack_helper (res + w * (n / 2), y, n - n / 2, b, 0);
      mpz_tdiv_r_2exp (y, op, (n / 2) * b + k);
      ref_zn_array_unpack_helper (res, y, n / 2, b, k);
   }

   mpz_clear (y);
}
void
zn_array_mulmid_fft_precomp1_init (zn_array_mulmid_fft_precomp1_t res,
                                   const ulong* op1, size_t n1, size_t n2,
                                   ulong x, const zn_mod_t mod)
{
   ZNP_ASSERT (mod->m & 1);
   ZNP_ASSERT (n2 >= 1);
   ZNP_ASSERT (n1 >= n2);

   res->n1 = n1;
   res->n2 = n2;

   unsigned lgK, lgM;
   mulmid_fft_params (&lgK, &lgM, &res->m1, &res->m2, &res->p, n1, n2);

   ulong M = 1UL << lgM;
   ptrdiff_t skip = M + 1;

   // allocate space for transposed IFFT
   pmfvec_init (res->vec1, lgK, skip, lgM, mod);

   // split input, with padding, in reversed order, and apply requested
   // scaling factor
   pmfvec_reverse (res->vec1, res->m1);
   fft_split (res->vec1, op1, n1, res->p, x, 0);
   pmfvec_reverse (res->vec1, res->m1);

   // transposed IFFT first input
   pmfvec_tpifft (res->vec1, res->m1, 0, res->m1, 0);
}
void
zn_array_mulmid_fft (ulong* res,
                     const ulong* op1, size_t n1,
                     const ulong* op2, size_t n2,
                     ulong x, const zn_mod_t mod)
{
   ZNP_ASSERT (mod->m & 1);
   ZNP_ASSERT (n2 >= 1);
   ZNP_ASSERT (n1 >= n2);

   // re-use the precomp1 code
   zn_array_mulmid_fft_precomp1_t precomp;
   zn_array_mulmid_fft_precomp1_init (precomp, op1, n1, n2, x, mod);
   zn_array_mulmid_fft_precomp1_execute (res, op2, 1, precomp);
   zn_array_mulmid_fft_precomp1_clear (precomp);
}
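/*
   Added usage sketch (not original library code): when the same op1 is to be
   middle-multiplied against several op2 arrays, the transform of op1 built by
   zn_array_mulmid_fft_precomp1_init() can be computed once and reused, so each
   additional product only costs the work in ..._execute(). The scaling
   argument x plays the same role as in zn_array_mulmid_fft() above; the
   function name and arguments here are hypothetical.
*/
void
zn_array_mulmid_fft_twice (ulong* res1, ulong* res2,
                           const ulong* op1, size_t n1,
                           const ulong* op2a, const ulong* op2b, size_t n2,
                           ulong x, const zn_mod_t mod)
{
   zn_array_mulmid_fft_precomp1_t precomp;
   zn_array_mulmid_fft_precomp1_init (precomp, op1, n1, n2, x, mod);

   // each call reuses the transposed IFFT of op1 stored in precomp
   zn_array_mulmid_fft_precomp1_execute (res1, op2a, 1, precomp);
   zn_array_mulmid_fft_precomp1_execute (res2, op2b, 1, precomp);

   zn_array_mulmid_fft_precomp1_clear (precomp);
}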
void
virtual_pmf_bfly (virtual_pmf_t op1, virtual_pmf_t op2)
{
   ZNP_ASSERT (op1->parent == op2->parent);
   struct virtual_pmfvec_struct* parent = op1->parent;

   // op1 == 0
   if (op1->index == -1)
   {
      virtual_pmf_set (op1, op2);
      return;
   }

   // op2 == 0
   if (op2->index == -1)
   {
      virtual_pmf_set (op2, op1);
      virtual_pmf_rotate (op2, parent->M);
      return;
   }

   virtual_pmf_isolate (op1);
   virtual_pmf_isolate (op2);

   pmf_t p1 = parent->buf[op1->index];
   pmf_t p2 = parent->buf[op2->index];
   p1[0] = op1->bias;
   p2[0] = op2->bias;

   pmf_bfly (p1, p2, parent->M, parent->mod);
}
void
virtual_pmf_sub (virtual_pmf_t res, virtual_pmf_t op)
{
   ZNP_ASSERT (res->parent == op->parent);
   struct virtual_pmfvec_struct* parent = res->parent;

   // op == 0
   if (op->index == -1)
      return;

   // res == 0
   if (res->index == -1)
   {
      virtual_pmf_set (res, op);
      virtual_pmf_rotate (res, parent->M);
      return;
   }

   virtual_pmf_isolate (res);

   pmf_t p2 = parent->buf[res->index];
   pmf_t p1 = parent->buf[op->index];
   p2[0] = res->bias;
   p1[0] = op->bias;

   pmf_sub (p2, p1, parent->M, parent->mod);
}
/*
   tests zn_array_pack() once for given n, b, k
*/
int
testcase_zn_array_pack (size_t n, unsigned b, unsigned k)
{
   ZNP_ASSERT (b >= 1);
   ZNP_ASSERT (n >= 1);

   int success = 1;

   ulong* in = (ulong*) malloc (sizeof (ulong) * n);
   size_t size = CEIL_DIV (n * b + k, GMP_NUMB_BITS);
   mp_limb_t* res = (mp_limb_t*) malloc (sizeof (mp_limb_t) * (size + 2));
   mp_limb_t* ref = (mp_limb_t*) malloc (sizeof (mp_limb_t) * (size + 2));

   // sentries to check buffer overflow
   res[0] = res[size + 1] = ref[0] = ref[size + 1] = 0x1234;

   // generate random data: at most b bits per input coefficient, possibly less
   unsigned rand_bits = (b >= ULONG_BITS) ? ULONG_BITS : b;
   rand_bits = random_ulong (rand_bits) + 1;
   ulong max = (rand_bits == ULONG_BITS) ? ((ulong)(-1))
                                         : ((1UL << rand_bits) - 1);

   size_t i;
   for (i = 0; i < n; i++)
      in[i] = random_ulong (max);

   // run target and reference implementation
   zn_array_pack (res + 1, in, n, 1, b, k, 0);
   ref_zn_array_pack (ref + 1, in, n, b, k);

   // check sentries
   success = success && (res[0] == 0x1234);
   success = success && (ref[0] == 0x1234);
   success = success && (res[size + 1] == 0x1234);
   success = success && (ref[size + 1] == 0x1234);

   // check correct result
   success = success && (mpn_cmp (res + 1, ref + 1, size) == 0);

   free (ref);
   free (res);
   free (in);

   return success;
}
unsigned
virtual_pmfvec_find_slot (virtual_pmfvec_t vec)
{
   unsigned i;
   for (i = 0; i < vec->max_buffers; i++)
      if (!vec->buf[i])
         return i;

   // this should never happen; we always should have enough slots
   ZNP_ASSERT (0);
   return 0;   // unreachable; avoids falling off the end when asserts are disabled
}
void
zn_array_pack1 (mp_limb_t* res, const ulong* op, size_t n, ptrdiff_t s,
                unsigned b, unsigned k, size_t r)
{
   ZNP_ASSERT (b > 0 && b <= ULONG_BITS);

#if GMP_NAIL_BITS == 0 && ULONG_BITS == GMP_NUMB_BITS

   // where to write the next limb
   mp_limb_t* dest = res;

   // write leading zero-padding
   while (k >= ULONG_BITS)
   {
      *dest++ = 0;
      k -= ULONG_BITS;
   }

   // limb currently being filled
   mp_limb_t buf = 0;
   // number of bits used in buf; always in [0, ULONG_BITS)
   unsigned buf_b = k;
   unsigned buf_b_old;

   for (; n > 0; n--, op += s)
   {
      ZNP_ASSERT (b >= ULONG_BITS || *op < (1UL << b));

      // put low bits of current input into buffer
      buf += *op << buf_b;
      buf_b_old = buf_b;
      buf_b += b;
      if (buf_b >= ULONG_BITS)
      {
         // buffer is full; flush it
         *dest++ = buf;
         buf_b -= ULONG_BITS;
         // put remaining bits of current input into buffer
         buf = buf_b_old ? (*op >> (ULONG_BITS - buf_b_old)) : 0;
      }
   }
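/*
   Added standalone illustration (not library code; assumes a 64-bit
   unsigned long): the same buffer-and-flush loop as above, packing four
   20-bit fields into machine words. Each field is shifted into the
   accumulator at the current bit offset; when 64 bits have accumulated the
   word is flushed and the bits that did not fit are carried over.
*/
#include <stdio.h>

int
main (void)
{
   const unsigned b = 20;                        // bits per field
   const unsigned long in[4] = {1, 2, 3, 4};
   unsigned long out[2] = {0, 0};                // 4 * 20 = 80 bits -> 2 words
   unsigned long buf = 0;                        // word being filled
   unsigned used = 0, pos = 0;                   // bits used in buf; words written

   for (unsigned i = 0; i < 4; i++)
   {
      buf |= in[i] << used;                      // low bits of the field
      unsigned old = used;
      used += b;
      if (used >= 64)
      {
         out[pos++] = buf;                       // buffer is full; flush it
         used -= 64;
         buf = old ? (in[i] >> (64 - old)) : 0;  // spill the high bits
      }
   }
   if (used)
      out[pos] = buf;                            // final partial word

   // expected output: 4000030000200001 0000000000000000
   printf ("%016lx %016lx\n", out[0], out[1]);
   return 0;
}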
void
zn_array_invert_extend (ulong* res, const ulong* approx, const ulong* op,
                        size_t n1, size_t n2, const zn_mod_t mod)
{
   ZNP_ASSERT (n2 >= 1);
   ZNP_ASSERT (n1 >= n2);

   // The algorithm is basically newton iteration, inspired partly by the
   // algorithm in [HZ04], as follows.
   //
   // Let f be the input series, of length n1 + n2.
   // Let g be the current approximation to 1/f, of length n1.
   //
   // By newton iteration, (2*g - g*g*f) is a length n1 + n2 approximation
   // to 1/f. Therefore the output of this function should be terms
   // [n1, n1 + n2) of -g*g*f.
   //
   // We have g*f = 1 + h*x^n1 + O(x^(n1 + n2)), where h has length n2,
   // i.e. h consists of terms [n1, n1 + n2) of g*f. Therefore h may be
   // recovered as the middle product of f[1, n1 + n2) and g[0, n1).
   //
   // Then g*g*f = g + g*h*x^n1 + O(x^(n1 + n2)). Since g has length
   // n1, the output is (the negative of) the first n2 coefficients of g*h.

   // Compute h, put it in res[0, n2).
   zn_array_mulmid (res, op + 1, n1 + n2 - 1, approx, n1, mod);

   // Compute g * h, put it into a scratch buffer.
   ZNP_FASTALLOC (temp, ulong, 6624, n1 + n2 - 1);
   zn_array_mul (temp, approx, n1, res, n2, mod);

   // Negate the first n2 coefficients of g * h into the output buffer.
   zn_array_neg (res, temp, n2, mod);

   ZNP_FASTFREE (temp);
}
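/*
   Added standalone sketch (toy odd modulus, schoolbook arithmetic; not library
   code) of the identity used above: if g agrees with 1/f to n1 terms and
   g*f = 1 + h*x^n1 + O(x^(n1+n2)) with n2 <= n1, then terms [n1, n1 + n2)
   of 1/f are the negatives of the first n2 coefficients of g*h.
*/
#include <assert.h>
#include <stddef.h>

#define N   8
#define MOD 17UL      // hypothetical small odd modulus; f[0] = 1 is invertible

// c[0, n) = product of a[0, na) and b[0, nb), truncated to n terms, mod MOD
static void
mul_trunc (unsigned long* c, const unsigned long* a, size_t na,
           const unsigned long* b, size_t nb, size_t n)
{
   size_t i, j;
   for (i = 0; i < n; i++)
      c[i] = 0;
   for (i = 0; i < na; i++)
      for (j = 0; j < nb && i + j < n; j++)
         c[i + j] = (c[i + j] + a[i] * b[j]) % MOD;
}

int
main (void)
{
   unsigned long f[N] = {1, 3, 5, 7, 11, 13, 2, 6};   // monic: f[0] == 1
   unsigned long g[N] = {1};                          // current approximation to 1/f
   unsigned long t[2 * N], h[N], u[N];
   size_t n1, n2, k;

   for (n1 = 1; n1 < N; n1 += n2)
   {
      n2 = (n1 < N - n1) ? n1 : N - n1;               // extend by n2 <= n1 terms

      mul_trunc (t, g, n1, f, N, n1 + n2);            // t = g*f = 1 + h*x^n1 + ...
      for (k = 0; k < n2; k++)
         h[k] = t[n1 + k];                            // h = terms [n1, n1+n2) of g*f

      mul_trunc (u, g, n1, h, n2, n2);                // first n2 terms of g*h
      for (k = 0; k < n2; k++)
         g[n1 + k] = (MOD - u[k]) % MOD;              // negate into the output
   }

   mul_trunc (t, f, N, g, N, N);                      // f * g should be 1 + O(x^N)
   assert (t[0] == 1);
   for (k = 1; k < N; k++)
      assert (t[k] == 0);
   return 0;
}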
void
merge_chunk_to_pmf (pmf_t res, const ulong* op, size_t n, size_t k, ulong M,
                    const zn_mod_t mod)
{
   ZNP_ASSERT ((M & 1) == 0);

   ulong r = (-res[0]) & (2*M - 1);

   size_t end = k + M/2;
   if (end > n)
      end = n;
   if (k >= end)
      // nothing to do
      return;

   op += k;
   ulong size = end - k;
   // now we need to handle op[0, size), and we are guaranteed size <= M/2.

   if (r < M)
   {
      if (size <= M - r)
         zn_array_add_inplace (res + 1 + r, op, size, mod);
      else
      {
         zn_array_add_inplace (res + 1 + r, op, M - r, mod);
         // negacyclic wraparound:
         zn_array_sub_inplace (res + 1, op + M - r, size - M + r, mod);
      }
   }
   else
   {
      r -= M;
      if (size <= M - r)
         zn_array_sub_inplace (res + 1 + r, op, size, mod);
      else
      {
         zn_array_sub_inplace (res + 1 + r, op, M - r, mod);
         // negacyclic wraparound:
         zn_array_add_inplace (res + 1, op + M - r, size - M + r, mod);
      }
   }
}
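/*
   Added standalone illustration (toy sizes, not library code) of the
   negacyclic wraparound used above: working modulo x^M + 1, a coefficient
   that lands at position M + i reappears at position i with its sign
   flipped, because x^M = -1.
*/
#include <stdio.h>

#define M   4
#define MOD 17

int
main (void)
{
   // reduce 1 + 2x + 3x^2 + 4x^3 + 5x^4 + 6x^5 modulo x^4 + 1 (coefficients mod 17)
   int c[6] = {1, 2, 3, 4, 5, 6};
   int r[M] = {0};

   for (int i = 0; i < 6; i++)
   {
      int pos = i % M;
      int sign = (i / M) % 2 ? -1 : 1;     // x^M = -1, x^(2M) = +1, ...
      r[pos] = ((r[pos] + sign * c[i]) % MOD + MOD) % MOD;
   }

   // expect 13 13 3 4, i.e. (1-5) + (2-6)x + 3x^2 + 4x^3 mod 17
   for (int i = 0; i < M; i++)
      printf ("%d ", r[i]);
   printf ("\n");
   return 0;
}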
/*
   Helper function for ref_zn_array_pack().

   Sets x = 2^k * (op[0] + op[1]*2^b + ... + op[n-1]*2^((n-1)*b)).

   Running time is soft-linear in output length.
*/
void
ref_zn_array_pack_helper (mpz_t x, const ulong* op, size_t n, unsigned b,
                          unsigned k)
{
   ZNP_ASSERT (n >= 1);

   if (n == 1)
   {
      // base case
      mpz_set_ui (x, op[0]);
      mpz_mul_2exp (x, x, k);
   }
   else
   {
      // recursively split into top and bottom halves
      mpz_t y;
      mpz_init (y);

      ref_zn_array_pack_helper (x, op, n / 2, b, k);
      ref_zn_array_pack_helper (y, op + n / 2, n - n / 2, b, 0);
      mpz_mul_2exp (y, y, (n / 2) * b + k);
      mpz_add (x, x, y);

      mpz_clear (y);
   }
}
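/*
   Added self-contained GMP example (assumed toy values, not library code)
   of the round trip that ref_zn_array_pack_helper() and
   ref_zn_array_unpack_helper() perform: evaluate
   x = 2^k * (op[0] + op[1]*2^b + op[2]*2^(2b)), then recover each
   coefficient by shifting and masking.
*/
#include <gmp.h>
#include <stdio.h>

int
main (void)
{
   const unsigned long op[3] = {5, 0, 9};
   const unsigned b = 10, k = 3;
   mpz_t x, t;
   mpz_inits (x, t, NULL);

   // pack: x = 2^k * sum op[i] * 2^(i*b), by Horner from the top coefficient
   for (int i = 2; i >= 0; i--)
   {
      mpz_mul_2exp (x, x, b);
      mpz_add_ui (x, x, op[i]);
   }
   mpz_mul_2exp (x, x, k);

   // unpack: coefficient i is bits [k + i*b, k + (i+1)*b) of x
   for (int i = 0; i < 3; i++)
   {
      mpz_tdiv_q_2exp (t, x, k + i * b);   // shift right
      mpz_tdiv_r_2exp (t, t, b);           // keep low b bits
      printf ("coefficient %d = %lu\n", i, mpz_get_ui (t));   // 5, 0, 9
   }

   mpz_clears (x, t, NULL);
   return 0;
}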
void
zn_array_invert_extend_fft (ulong* res, const ulong* approx, const ulong* op,
                            size_t n1, size_t n2, const zn_mod_t mod)
{
   ZNP_ASSERT (n2 >= 1);
   ZNP_ASSERT (n1 >= n2);
   ZNP_ASSERT (mod->m & 1);

   // The algorithm here is the same as in zn_array_invert_extend(), except
   // that we work with the FFTs directly. This allows us to save one FFT,
   // since we use the FFT of g in both the middle product step and the
   // product step.

   // Determine FFT parameters for computing h = middle product of
   // f[1, n1 + n2) and g[0, n1). (These parameters will also work for the
   // subsequent product g * h.)
   unsigned lgK, lgM;
   ulong m1, m2, m3, p;
   mulmid_fft_params (&lgK, &lgM, &m3, &m1, &p, n1 + n2 - 1, n1);
   m2 = m3 - m1 + 1;

   // We now have
   //    m1 = ceil(n1 / (M/2))
   //       = (n1 + p - 1) / (M/2).
   // Therefore
   //    m3 = ceil((n1 + n2 - 1 + p) / (M/2))
   //       = ceil(n2 / (M/2)) + (n1 + p - 1) / (M/2)
   // and
   //    m2 = ceil(n2 / (M/2)) + 1.

   ulong M = 1UL << lgM;
   ulong K = 1UL << lgK;
   ptrdiff_t skip = M + 1;

   pmfvec_t vec1, vec2;
   pmfvec_init (vec1, lgK, skip, lgM, mod);
   pmfvec_init (vec2, lgK, skip, lgM, mod);

   // Find scaling factor that needs to be applied to both of the products
   // below; takes into account the fudge from the pointwise multiplies, and
   // the division by 2^lgK coming from the FFTs.
   ulong x = pmfvec_mul_fudge (lgM, 0, mod);
   x = zn_mod_mul (x, zn_mod_pow2 (-lgK, mod), mod);

   // Split g[0, n1) into m1 coefficients, apply scaling factor, and compute
   // m3 fourier coefficients, written to vec2.
   fft_split (vec2, approx, n1, 0, x, 0);
   pmfvec_fft (vec2, m3, m1, 0);

   // Split f[1, n1 + n2) into m3 coefficients (in reversed order, with
   // appropriate zero-padding), and compute transposed IFFT of length m3,
   // written to vec1.
   pmfvec_reverse (vec1, m3);
   fft_split (vec1, op + 1, n1 + n2 - 1, p, 1, 0);
   pmfvec_reverse (vec1, m3);
   pmfvec_tpifft (vec1, m3, 0, m3, 0);

   // Pointwise multiply the above FFT and transposed IFFT, into vec1.
   pmfvec_mul (vec1, vec1, vec2, m3, 0);

   // Transposed FFT vec1, obtaining m2 coefficients, then reverse and combine.
   pmfvec_tpfft (vec1, m3, m2, 0);
   pmfvec_reverse (vec1, m2);
   fft_combine (res, n2, vec1, m2, 1);
   pmfvec_reverse (vec1, m2);

   // At this stage we have obtained the polynomial h in res[0, n2).
   // Now we must compute h * g.

   // Split h[0, n2) into m2 - 1 coefficients, and compute m3 - 1 fourier
   // coefficients in vec1. For the splitting step, we set the bias to M,
   // which effectively negates everything, so we're really computing the FFT
   // of -h.
   fft_split (vec1, res, n2, 0, 1, M);
   pmfvec_fft (vec1, m3 - 1, m2 - 1, 0);

   // Pointwise multiply that FFT with the first FFT of g into vec2.
   pmfvec_mul (vec2, vec2, vec1, m3 - 1, 1);
   pmfvec_clear (vec1);

   // IFFT and combine, to obtain the product -h * g. We only need the low n2
   // terms of the product (we throw away the high n1 - 1 terms).
   pmfvec_ifft (vec2, m3 - 1, 0, m3 - 1, 0);
   fft_combine (res, n2, vec2, m3 - 1, 0);
   pmfvec_clear (vec2);
}
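/*
   Added standalone check (hypothetical sizes n1 = 10, n2 = 7 and M = 8, i.e.
   M/2 = 4; not library code) of the ceiling identities quoted above, which
   justify m2 = m3 - m1 + 1.
*/
#include <assert.h>

int
main (void)
{
   unsigned long n1 = 10, n2 = 7, Mhalf = 4;
   unsigned long p  = ((-n1) & (Mhalf - 1)) + 1;              // = 3, so n1 + p - 1 = 12 is a multiple of M/2
   unsigned long m1 = (n1 + p - 1) / Mhalf;                   // = 3
   unsigned long m3 = (n1 + n2 - 1 + p + Mhalf - 1) / Mhalf;  // = 5

   assert (m1 == (n1 + Mhalf - 1) / Mhalf);                   // m1 = ceil(n1 / (M/2))
   assert (m3 == (n2 + Mhalf - 1) / Mhalf + m1);              // m3 = ceil(n2 / (M/2)) + (n1 + p - 1) / (M/2)
   assert (m3 - m1 + 1 == (n2 + Mhalf - 1) / Mhalf + 1);      // m2 = ceil(n2 / (M/2)) + 1
   return 0;
}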
/*
   Multiplication/squaring using Kronecker substitution at 2^b, -2^b,
   2^(-b) and -2^(-b).
*/
void
zn_array_mul_KS4 (ulong* res,
                  const ulong* op1, size_t n1,
                  const ulong* op2, size_t n2,
                  int redc, const zn_mod_t mod)
{
   ZNP_ASSERT (n2 >= 1);
   ZNP_ASSERT (n1 >= n2);
   ZNP_ASSERT (n1 <= ULONG_MAX);
   ZNP_ASSERT ((mod->m & 1) || !redc);

   if (n2 == 1)
   {
      // code below needs n2 > 1, so fall back on scalar multiplication
      _zn_array_scalar_mul (res, op1, n1, op2[0], redc, mod);
      return;
   }

   int sqr = (op1 == op2 && n1 == n2);

   // bits in each output coefficient
   unsigned bits = 2 * mod->bits + ceil_lg (n2);

   // we're evaluating at x = B, -B, 1/B, -1/B,
   // where B = 2^b, and b = ceil(bits / 4)
   unsigned b = (bits + 3) / 4;

   // number of ulongs required to store each base-B^2 digit
   unsigned w = CEIL_DIV (2 * b, ULONG_BITS);
   ZNP_ASSERT (w <= 2);

   // Write f1(x) = f1e(x^2) + x * f1o(x^2)
   //       f2(x) = f2e(x^2) + x * f2o(x^2)
   //        h(x) =  he(x^2) + x *  ho(x^2)
   // "e" = even, "o" = odd

   size_t n1o = n1 / 2;
   size_t n1e = n1 - n1o;

   size_t n2o = n2 / 2;
   size_t n2e = n2 - n2o;

   size_t n3 = n1 + n2 - 1;     // length of h
   size_t n3o = n3 / 2;
   size_t n3e = n3 - n3o;

   // Put k1 = number of limbs needed to store f1(B) and |f1(-B)|.
   // In f1(B), the leading coefficient starts at bit position b * (n1 - 1)
   // and has length 2b, and the coefficients overlap so we need an extra bit
   // for the carry: this gives (n1 + 1) * b + 1 bits. Ditto for f2.
   size_t k1 = CEIL_DIV ((n1 + 1) * b + 1, GMP_NUMB_BITS);
   size_t k2 = CEIL_DIV ((n2 + 1) * b + 1, GMP_NUMB_BITS);
   size_t k3 = k1 + k2;

   // allocate space
   ZNP_FASTALLOC (limbs, mp_limb_t, 6624, 5 * k3);
   mp_limb_t* v1_buf0 = limbs;           // k1 limbs
   mp_limb_t* v2_buf0 = v1_buf0 + k1;    // k2 limbs
   mp_limb_t* v1_buf1 = v2_buf0 + k2;    // k1 limbs
   mp_limb_t* v2_buf1 = v1_buf1 + k1;    // k2 limbs
   mp_limb_t* v1_buf2 = v2_buf1 + k2;    // k1 limbs
   mp_limb_t* v2_buf2 = v1_buf2 + k1;    // k2 limbs
   mp_limb_t* v1_buf3 = v2_buf2 + k2;    // k1 limbs
   mp_limb_t* v2_buf3 = v1_buf3 + k1;    // k2 limbs
   mp_limb_t* v1_buf4 = v2_buf3 + k2;    // k1 limbs
   mp_limb_t* v2_buf4 = v1_buf4 + k1;    // k2 limbs

   // arrange overlapping buffers to minimise memory use
   // "p" = plus, "m" = minus
   // "n" = normal order, "r" = reciprocal order
   mp_limb_t* v1en = v1_buf0;
   mp_limb_t* v1on = v1_buf1;
   mp_limb_t* v1pn = v1_buf2;
   mp_limb_t* v1mn = v1_buf0;
   mp_limb_t* v2en = v2_buf0;
   mp_limb_t* v2on = v2_buf1;
   mp_limb_t* v2pn = v2_buf2;
   mp_limb_t* v2mn = v2_buf0;
   mp_limb_t* v3pn = v1_buf1;
   mp_limb_t* v3mn = v1_buf2;
   mp_limb_t* v3en = v1_buf0;
   mp_limb_t* v3on = v1_buf1;

   mp_limb_t* v1er = v1_buf2;
   mp_limb_t* v1or = v1_buf3;
   mp_limb_t* v1pr = v1_buf4;
   mp_limb_t* v1mr = v1_buf2;
   mp_limb_t* v2er = v2_buf2;
   mp_limb_t* v2or = v2_buf3;
   mp_limb_t* v2pr = v2_buf4;
   mp_limb_t* v2mr = v2_buf2;
   mp_limb_t* v3pr = v1_buf3;
   mp_limb_t* v3mr = v1_buf4;
   mp_limb_t* v3er = v1_buf2;
   mp_limb_t* v3or = v1_buf3;

   ZNP_FASTALLOC (z, ulong, 6624, 2 * w * (n3e + 1));
   ulong* zn = z;
   ulong* zr = z + w * (n3e + 1);

   int v3m_neg;

   // -------------------------------------------------------------------------
   //     "normal" evaluation points

   if (!sqr)
   {
      // multiplication version

      // evaluate f1e(B^2) and B * f1o(B^2)
      // We need max(2 * b*n1e, 2 * b*n1o + b) bits for this packing step,
      // which is safe since (n1 + 1) * b + 1 >= max(2 * b*n1e, 2 * b*n1o + b).
      // Ditto for f2 below.
      zn_array_pack (v1en, op1, n1e, 2, 2 * b, 0, k1);
      zn_array_pack (v1on, op1 + 1, n1o, 2, 2 * b, b, k1);

      // compute  f1(B)   =  f1e(B^2) + B * f1o(B^2)
      // and     |f1(-B)| = |f1e(B^2) - B * f1o(B^2)|
      ZNP_ASSERT_NOCARRY (mpn_add_n (v1pn, v1en, v1on, k1));
      v3m_neg = signed_mpn_sub_n (v1mn, v1en, v1on, k1);

      // evaluate f2e(B^2) and B * f2o(B^2)
      zn_array_pack (v2en, op2, n2e, 2, 2 * b, 0, k2);
      zn_array_pack (v2on, op2 + 1, n2o, 2, 2 * b, b, k2);

      // compute  f2(B)   =  f2e(B^2) + B * f2o(B^2)
      // and     |f2(-B)| = |f2e(B^2) - B * f2o(B^2)|
      ZNP_ASSERT_NOCARRY (mpn_add_n (v2pn, v2en, v2on, k2));
      v3m_neg ^= signed_mpn_sub_n (v2mn, v2en, v2on, k2);

      // compute  h(B)   =  f1(B)   *  f2(B)
      // and     |h(-B)| = |f1(-B)| * |f2(-B)|
      // v3m_neg is set if h(-B) is negative
      ZNP_mpn_mul (v3pn, v1pn, k1, v2pn, k2);
      ZNP_mpn_mul (v3mn, v1mn, k1, v2mn, k2);
   }
   else
   {
      // squaring version

      // evaluate f1e(B^2) and B * f1o(B^2)
      zn_array_pack (v1en, op1, n1e, 2, 2 * b, 0, k1);
      zn_array_pack (v1on, op1 + 1, n1o, 2, 2 * b, b, k1);

      // compute  f1(B)   =  f1e(B^2) + B * f1o(B^2)
      // and     |f1(-B)| = |f1e(B^2) - B * f1o(B^2)|
      ZNP_ASSERT_NOCARRY (mpn_add_n (v1pn, v1en, v1on, k1));
      signed_mpn_sub_n (v1mn, v1en, v1on, k1);

      // compute h(B)  = f1(B)^2
      // and     h(-B) = |f1(-B)|^2
      // v3m_neg is cleared since h(-B) is never negative
      ZNP_mpn_mul (v3pn, v1pn, k1, v1pn, k1);
      ZNP_mpn_mul (v3mn, v1mn, k1, v1mn, k1);
      v3m_neg = 0;
   }

   // Each coefficient of h(B) is up to 4b bits long, so h(B) needs at most
   // ((n1 + n2 + 2) * b + 1) bits. (The extra +1 is to accommodate carries
   // generated by overlapping coefficients.) The buffer has at least
   // ((n1 + n2 + 2) * b + 2) bits. Therefore we can safely store 2*h(B) etc.

   // compute     2 * he(B^2) = h(B) + h(-B)
   // and     B * 2 * ho(B^2) = h(B) - h(-B)
   if (v3m_neg)
   {
      ZNP_ASSERT_NOCARRY (mpn_sub_n (v3en, v3pn, v3mn, k3));
      ZNP_ASSERT_NOCARRY (mpn_add_n (v3on, v3pn, v3mn, k3));
   }
   else
   {
      ZNP_ASSERT_NOCARRY (mpn_add_n (v3en, v3pn, v3mn, k3));
      ZNP_ASSERT_NOCARRY (mpn_sub_n (v3on, v3pn, v3mn, k3));
   }

   // -------------------------------------------------------------------------
   //     "reciprocal" evaluation points

   // correction factors to take into account that if a polynomial has even
   // length, its even and odd coefficients are swapped when the polynomial
   // is reversed
   unsigned a1 = (n1 & 1) ? 0 : b;
   unsigned a2 = (n2 & 1) ? 0 : b;
   unsigned a3 = (n3 & 1) ? 0 : b;

   if (!sqr)
   {
      // multiplication version

      // evaluate B^(n1-1) * f1e(1/B^2) and B^(n1-2) * f1o(1/B^2)
      zn_array_pack (v1er, op1 + 2*(n1e - 1), n1e, -2, 2 * b, a1, k1);
      zn_array_pack (v1or, op1 + 1 + 2*(n1o - 1), n1o, -2, 2 * b, b - a1, k1);

      // compute  B^(n1-1) * f1(1/B) =
      //                     B^(n1-1) * f1e(1/B^2) + B^(n1-2) * f1o(1/B^2)
      // and     |B^(n1-1) * f1(-1/B)| =
      //                    |B^(n1-1) * f1e(1/B^2) - B^(n1-2) * f1o(1/B^2)|
      ZNP_ASSERT_NOCARRY (mpn_add_n (v1pr, v1er, v1or, k1));
      v3m_neg = signed_mpn_sub_n (v1mr, v1er, v1or, k1);

      // evaluate B^(n2-1) * f2e(1/B^2) and B^(n2-2) * f2o(1/B^2)
      zn_array_pack (v2er, op2 + 2*(n2e - 1), n2e, -2, 2 * b, a2, k2);
      zn_array_pack (v2or, op2 + 1 + 2*(n2o - 1), n2o, -2, 2 * b, b - a2, k2);

      // compute  B^(n2-1) * f2(1/B) =
      //                     B^(n2-1) * f2e(1/B^2) + B^(n2-2) * f2o(1/B^2)
      // and     |B^(n2-1) * f2(-1/B)| =
      //                    |B^(n2-1) * f2e(1/B^2) - B^(n2-2) * f2o(1/B^2)|
      ZNP_ASSERT_NOCARRY (mpn_add_n (v2pr, v2er, v2or, k2));
      v3m_neg ^= signed_mpn_sub_n (v2mr, v2er, v2or, k2);

      // compute  B^(n3-1) * h(1/B) =
      //                  (B^(n1-1) * f1(1/B)) * (B^(n2-1) * f2(1/B))
      // and     |B^(n3-1) * h(-1/B)| =
      //                 |B^(n1-1) * f1(-1/B)| * |B^(n2-1) * f2(-1/B)|
      // v3m_neg is set if h(-1/B) is negative
      ZNP_mpn_mul (v3pr, v1pr, k1, v2pr, k2);
      ZNP_mpn_mul (v3mr, v1mr, k1, v2mr, k2);
   }
   else
   {
      // squaring version

      // evaluate B^(n1-1) * f1e(1/B^2) and B^(n1-2) * f1o(1/B^2)
      zn_array_pack (v1er, op1 + 2*(n1e - 1), n1e, -2, 2 * b, a1, k1);
      zn_array_pack (v1or, op1 + 1 + 2*(n1o - 1), n1o, -2, 2 * b, b - a1, k1);

      // compute  B^(n1-1) * f1(1/B) =
      //                     B^(n1-1) * f1e(1/B^2) + B^(n1-2) * f1o(1/B^2)
      // and     |B^(n1-1) * f1(-1/B)| =
      //                    |B^(n1-1) * f1e(1/B^2) - B^(n1-2) * f1o(1/B^2)|
      ZNP_ASSERT_NOCARRY (mpn_add_n (v1pr, v1er, v1or, k1));
      signed_mpn_sub_n (v1mr, v1er, v1or, k1);

      // compute B^(n3-1) * h(1/B)  = (B^(n1-1) * f1(1/B))^2
      // and     B^(n3-1) * h(-1/B) = |B^(n1-1) * f1(-1/B)|^2
      // v3m_neg is cleared since h(-1/B) is never negative
      ZNP_mpn_mul (v3pr, v1pr, k1, v1pr, k1);
      ZNP_mpn_mul (v3mr, v1mr, k1, v1mr, k1);
      v3m_neg = 0;
   }

   // compute 2 * B^(n3-1) * he(1/B^2)
   //                 = B^(n3-1) * h(1/B) + B^(n3-1) * h(-1/B)
   // and     2 * B^(n3-2) * ho(1/B^2)
   //                 = B^(n3-1) * h(1/B) - B^(n3-1) * h(-1/B)
   if (v3m_neg)
   {
      ZNP_ASSERT_NOCARRY (mpn_sub_n (v3er, v3pr, v3mr, k3));
      ZNP_ASSERT_NOCARRY (mpn_add_n (v3or, v3pr, v3mr, k3));
   }
   else
   {
      ZNP_ASSERT_NOCARRY (mpn_add_n (v3er, v3pr, v3mr, k3));
      ZNP_ASSERT_NOCARRY (mpn_sub_n (v3or, v3pr, v3mr, k3));
   }

   // -------------------------------------------------------------------------
   //     combine "normal" and "reciprocal" information

   // decompose he(B^2) and B^(2*(n3e-1)) * he(1/B^2) into base-B^2 digits
   zn_array_unpack_SAFE (zn, v3en, n3e + 1, 2 * b, 1, k3);
   zn_array_unpack_SAFE (zr, v3er, n3e + 1, 2 * b, a3 + 1, k3);

   // combine he(B^2) and he(1/B^2) information to get even coefficients of h
   zn_array_recover_reduce (res, 2, zn, zr, n3e, 2 * b, redc, mod);

   // decompose ho(B^2) and B^(2*(n3o-1)) * ho(1/B^2) into base-B^2 digits
   zn_array_unpack_SAFE (zn, v3on, n3o + 1, 2 * b, b + 1, k3);
   zn_array_unpack_SAFE (zr, v3or, n3o + 1, 2 * b, b - a3 + 1, k3);

   // combine ho(B^2) and ho(1/B^2) information to get odd coefficients of h
   zn_array_recover_reduce (res + 1, 2, zn, zr, n3o, 2 * b, redc, mod);

   ZNP_FASTFREE (z);
   ZNP_FASTFREE (limbs);
}
/*
   Multiplication/squaring using Kronecker substitution at 2^b.
*/
void
zn_array_mul_KS1 (ulong* res,
                  const ulong* op1, size_t n1,
                  const ulong* op2, size_t n2,
                  int redc, const zn_mod_t mod)
{
   ZNP_ASSERT (n2 >= 1);
   ZNP_ASSERT (n1 >= n2);
   ZNP_ASSERT (n1 <= ULONG_MAX);
   ZNP_ASSERT ((mod->m & 1) || !redc);

   int sqr = (op1 == op2 && n1 == n2);

   // length of h
   size_t n3 = n1 + n2 - 1;

   // bits in each output coefficient
   unsigned b = 2 * mod->bits + ceil_lg (n2);

   // number of ulongs required to store each output coefficient
   unsigned w = CEIL_DIV (b, ULONG_BITS);
   ZNP_ASSERT (w <= 3);

   // number of limbs needed to store f1(2^b) and f2(2^b)
   size_t k1 = CEIL_DIV (n1 * b, GMP_NUMB_BITS);
   size_t k2 = CEIL_DIV (n2 * b, GMP_NUMB_BITS);

   // allocate space
   ZNP_FASTALLOC (limbs, mp_limb_t, 6624, 2 * (k1 + k2));
   mp_limb_t* v1 = limbs;       // k1 limbs
   mp_limb_t* v2 = v1 + k1;     // k2 limbs
   mp_limb_t* v3 = v2 + k2;     // k1 + k2 limbs

   if (!sqr)
   {
      // multiplication version

      // evaluate f1(2^b) and f2(2^b)
      zn_array_pack (v1, op1, n1, 1, b, 0, 0);
      zn_array_pack (v2, op2, n2, 1, b, 0, 0);

      // compute h(2^b) = f1(2^b) * f2(2^b)
      ZNP_mpn_mul (v3, v1, k1, v2, k2);
   }
   else
   {
      // squaring version

      // evaluate f1(2^b)
      zn_array_pack (v1, op1, n1, 1, b, 0, 0);

      // compute h(2^b) = f1(2^b)^2
      ZNP_mpn_mul (v3, v1, k1, v1, k1);
   }

   // unpack coefficients of h, and reduce mod m
   ZNP_FASTALLOC (z, ulong, 6624, n3 * w);
   zn_array_unpack_SAFE (z, v3, n3, b, 0, k1 + k2);
   array_reduce (res, 1, z, n3, w, redc, mod);

   ZNP_FASTFREE (z);
   ZNP_FASTFREE (limbs);
}
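/*
   Added self-contained GMP sketch (toy polynomials and bit count; not library
   code) of the Kronecker substitution idea behind zn_array_mul_KS1(): pack
   each polynomial into one big integer, perform a single integer
   multiplication, then read the product coefficients back out of the bit
   string.
*/
#include <gmp.h>
#include <stdio.h>

int
main (void)
{
   const unsigned long f[3] = {1, 2, 3}, g[2] = {4, 5};
   // coefficients of f*g are at most 22 < 2^6, so b = 6 bits per slot keeps
   // them from overlapping
   const unsigned b = 6;
   mpz_t F, G, H, t;
   mpz_inits (F, G, H, t, NULL);

   // evaluate f(2^b) and g(2^b) by Horner
   for (int i = 2; i >= 0; i--) { mpz_mul_2exp (F, F, b); mpz_add_ui (F, F, f[i]); }
   for (int i = 1; i >= 0; i--) { mpz_mul_2exp (G, G, b); mpz_add_ui (G, G, g[i]); }

   // h(2^b) = f(2^b) * g(2^b)
   mpz_mul (H, F, G);

   // unpack the 4 coefficients of h = f*g; expect 4, 13, 22, 15
   for (int i = 0; i < 4; i++)
   {
      mpz_tdiv_q_2exp (t, H, (unsigned long) i * b);
      mpz_tdiv_r_2exp (t, t, b);
      printf ("h[%d] = %lu\n", i, mpz_get_ui (t));
   }

   mpz_clears (F, G, H, t, NULL);
   return 0;
}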
/*
   Multiplication/squaring using Kronecker substitution at 2^b and 2^(-b).

   Note: this routine does not appear to be competitive in practice with the
   other KS routines. It's here just for fun.
*/
void
zn_array_mul_KS3 (ulong* res,
                  const ulong* op1, size_t n1,
                  const ulong* op2, size_t n2,
                  int redc, const zn_mod_t mod)
{
   ZNP_ASSERT (n2 >= 1);
   ZNP_ASSERT (n1 >= n2);
   ZNP_ASSERT (n1 <= ULONG_MAX);
   ZNP_ASSERT ((mod->m & 1) || !redc);

   int sqr = (op1 == op2 && n1 == n2);

   // length of h
   size_t n3 = n1 + n2 - 1;

   // bits in each output coefficient
   unsigned bits = 2 * mod->bits + ceil_lg (n2);

   // we're evaluating at x = B and 1/B, where B = 2^b, and b = ceil(bits / 2)
   unsigned b = (bits + 1) / 2;

   // number of ulongs required to store each base-B digit
   unsigned w = CEIL_DIV (b, ULONG_BITS);
   ZNP_ASSERT (w <= 2);

   // limbs needed to store f1(B) and B^(n1-1) * f1(1/B), ditto for f2
   size_t k1 = CEIL_DIV (n1 * b, GMP_NUMB_BITS);
   size_t k2 = CEIL_DIV (n2 * b, GMP_NUMB_BITS);

   // allocate space
   ZNP_FASTALLOC (limbs, mp_limb_t, 6624, 2 * (k1 + k2));
   mp_limb_t* v1 = limbs;       // k1 limbs
   mp_limb_t* v2 = v1 + k1;     // k2 limbs
   mp_limb_t* v3 = v2 + k2;     // k1 + k2 limbs

   ZNP_FASTALLOC (z, ulong, 6624, 2 * w * (n3 + 1));
   // "n" = normal order, "r" = reciprocal order
   ulong* zn = z;
   ulong* zr = z + w * (n3 + 1);

   if (!sqr)
   {
      // multiplication version

      // evaluate f1(B) and f2(B)
      zn_array_pack (v1, op1, n1, 1, b, 0, k1);
      zn_array_pack (v2, op2, n2, 1, b, 0, k2);

      // compute h(B) = f1(B) * f2(B)
      ZNP_mpn_mul (v3, v1, k1, v2, k2);
   }
   else
   {
      // squaring version

      // evaluate f1(B)
      zn_array_pack (v1, op1, n1, 1, b, 0, k1);

      // compute h(B) = f1(B)^2
      ZNP_mpn_mul (v3, v1, k1, v1, k1);
   }

   // decompose h(B) into base-B digits
   zn_array_unpack_SAFE (zn, v3, n3 + 1, b, 0, k1 + k2);

   if (!sqr)
   {
      // multiplication version

      // evaluate B^(n1-1) * f1(1/B) and B^(n2-1) * f2(1/B)
      zn_array_pack (v1, op1 + n1 - 1, n1, -1, b, 0, k1);
      zn_array_pack (v2, op2 + n2 - 1, n2, -1, b, 0, k2);

      // compute B^(n1+n2-2) * h(1/B) =
      //                  (B^(n1-1) * f1(1/B)) * (B^(n2-1) * f2(1/B))
      ZNP_mpn_mul (v3, v1, k1, v2, k2);
   }
   else
   {
      // squaring version

      // evaluate B^(n1-1) * f1(1/B)
      zn_array_pack (v1, op1 + n1 - 1, n1, -1, b, 0, k1);

      // compute B^(2*n1-2) * h(1/B) = (B^(n1-1) * f1(1/B))^2
      ZNP_mpn_mul (v3, v1, k1, v1, k1);
   }

   // decompose h(1/B) into base-B digits
   zn_array_unpack_SAFE (zr, v3, n3 + 1, b, 0, k1 + k2);

   // recover h(x) from h(B) and h(1/B)
   // (note: need to check that the high digit of each output coefficient
   // is < B - 1; this follows from an estimate in section 3.2 of [Har07].)
   zn_array_recover_reduce (res, 1, zn, zr, n3, b, redc, mod);

   ZNP_FASTFREE (z);
   ZNP_FASTFREE (limbs);
}
/*
   Multiplication/squaring using Kronecker substitution at 2^b and -2^b.
*/
void
zn_array_mul_KS2 (ulong* res,
                  const ulong* op1, size_t n1,
                  const ulong* op2, size_t n2,
                  int redc, const zn_mod_t mod)
{
   ZNP_ASSERT (n2 >= 1);
   ZNP_ASSERT (n1 >= n2);
   ZNP_ASSERT (n1 <= ULONG_MAX);
   ZNP_ASSERT ((mod->m & 1) || !redc);

   if (n2 == 1)
   {
      // code below needs n2 > 1, so fall back on scalar multiplication
      _zn_array_scalar_mul (res, op1, n1, op2[0], redc, mod);
      return;
   }

   int sqr = (op1 == op2 && n1 == n2);

   // bits in each output coefficient
   unsigned bits = 2 * mod->bits + ceil_lg (n2);

   // we're evaluating at x = B and -B, where B = 2^b, and b = ceil(bits / 2)
   unsigned b = (bits + 1) / 2;

   // number of ulongs required to store each output coefficient
   unsigned w = CEIL_DIV (2 * b, ULONG_BITS);
   ZNP_ASSERT (w <= 3);

   // Write f1(x) = f1e(x^2) + x * f1o(x^2)
   //       f2(x) = f2e(x^2) + x * f2o(x^2)
   //        h(x) =  he(x^2) + x *  ho(x^2)
   // "e" = even, "o" = odd

   size_t n1o = n1 / 2;
   size_t n1e = n1 - n1o;

   size_t n2o = n2 / 2;
   size_t n2e = n2 - n2o;

   size_t n3 = n1 + n2 - 1;     // length of h
   size_t n3o = n3 / 2;
   size_t n3e = n3 - n3o;

   // f1(B) and |f1(-B)| are at most ((n1 - 1) * b + mod->bits) bits long.
   // However, when evaluating f1e(B^2) and B * f1o(B^2) the bitpacking
   // routine needs room for the last chunk of 2b bits. Therefore we need to
   // allow room for (n1 + 1) * b bits. Ditto for f2.
   size_t k1 = CEIL_DIV ((n1 + 1) * b, GMP_NUMB_BITS);
   size_t k2 = CEIL_DIV ((n2 + 1) * b, GMP_NUMB_BITS);
   size_t k3 = k1 + k2;

   // allocate space
   ZNP_FASTALLOC (limbs, mp_limb_t, 6624, 3 * k3);
   mp_limb_t* v1_buf0 = limbs;           // k1 limbs
   mp_limb_t* v2_buf0 = v1_buf0 + k1;    // k2 limbs
   mp_limb_t* v1_buf1 = v2_buf0 + k2;    // k1 limbs
   mp_limb_t* v2_buf1 = v1_buf1 + k1;    // k2 limbs
   mp_limb_t* v1_buf2 = v2_buf1 + k2;    // k1 limbs
   mp_limb_t* v2_buf2 = v1_buf2 + k1;    // k2 limbs

   // arrange overlapping buffers to minimise memory use
   // "p" = plus, "m" = minus
   mp_limb_t* v1e = v1_buf0;
   mp_limb_t* v2e = v2_buf0;
   mp_limb_t* v1o = v1_buf1;
   mp_limb_t* v2o = v2_buf1;
   mp_limb_t* v1p = v1_buf2;
   mp_limb_t* v2p = v2_buf2;
   mp_limb_t* v1m = v1_buf0;
   mp_limb_t* v2m = v2_buf0;
   mp_limb_t* v3m = v1_buf1;
   mp_limb_t* v3p = v1_buf0;
   mp_limb_t* v3e = v1_buf2;
   mp_limb_t* v3o = v1_buf0;

   ZNP_FASTALLOC (z, ulong, 6624, w * n3e);

   int v3m_neg;

   if (!sqr)
   {
      // multiplication version

      // evaluate f1e(B^2) and B * f1o(B^2)
      zn_array_pack (v1e, op1, n1e, 2, 2 * b, 0, k1);
      zn_array_pack (v1o, op1 + 1, n1o, 2, 2 * b, b, k1);

      // evaluate f2e(B^2) and B * f2o(B^2)
      zn_array_pack (v2e, op2, n2e, 2, 2 * b, 0, k2);
      zn_array_pack (v2o, op2 + 1, n2o, 2, 2 * b, b, k2);

      // compute f1(B) = f1e(B^2) + B * f1o(B^2)
      // and     f2(B) = f2e(B^2) + B * f2o(B^2)
      ZNP_ASSERT_NOCARRY (mpn_add_n (v1p, v1e, v1o, k1));
      ZNP_ASSERT_NOCARRY (mpn_add_n (v2p, v2e, v2o, k2));

      // compute |f1(-B)| = |f1e(B^2) - B * f1o(B^2)|
      // and     |f2(-B)| = |f2e(B^2) - B * f2o(B^2)|
      v3m_neg = signed_mpn_sub_n (v1m, v1e, v1o, k1);
      v3m_neg ^= signed_mpn_sub_n (v2m, v2e, v2o, k2);

      // compute  h(B)   =  f1(B)   *  f2(B)
      // compute |h(-B)| = |f1(-B)| * |f2(-B)|
      // v3m_neg is set if h(-B) is negative
      ZNP_mpn_mul (v3m, v1m, k1, v2m, k2);
      ZNP_mpn_mul (v3p, v1p, k1, v2p, k2);
   }
   else
   {
      // squaring version

      // evaluate f1e(B^2) and B * f1o(B^2)
      zn_array_pack (v1e, op1, n1e, 2, 2 * b, 0, k1);
      zn_array_pack (v1o, op1 + 1, n1o, 2, 2 * b, b, k1);

      // compute f1(B) = f1e(B^2) + B * f1o(B^2)
      ZNP_ASSERT_NOCARRY (mpn_add_n (v1p, v1e, v1o, k1));

      // compute |f1(-B)| = |f1e(B^2) - B * f1o(B^2)|
      signed_mpn_sub_n (v1m, v1e, v1o, k1);

      // compute h(B)  = f1(B)^2
      // compute h(-B) = f1(-B)^2
      // v3m_neg is cleared (since f1(-B)^2 is never negative)
      ZNP_mpn_mul (v3m, v1m, k1, v1m, k1);
      ZNP_mpn_mul (v3p, v1p, k1, v1p, k1);
      v3m_neg = 0;
   }

   // he(B^2) and B * ho(B^2) are both at most b * (n3 + 1) bits long (since
   // the coefficients don't overlap). The buffers used below are at least
   // b * (n1 + n2 + 2) = b * (n3 + 3) bits long. So we definitely have
   // enough room for 2 * he(B^2) and 2 * B * ho(B^2).

   // compute 2 * he(B^2) = h(B) + h(-B)
   ZNP_ASSERT_NOCARRY (v3m_neg ? mpn_sub_n (v3e, v3p, v3m, k3)
                               : mpn_add_n (v3e, v3p, v3m, k3));

   // unpack coefficients of he, and reduce mod m
   zn_array_unpack_SAFE (z, v3e, n3e, 2 * b, 1, k3);
   array_reduce (res, 2, z, n3e, w, redc, mod);

   // compute 2 * B * ho(B^2) = h(B) - h(-B)
   ZNP_ASSERT_NOCARRY (v3m_neg ? mpn_add_n (v3o, v3p, v3m, k3)
                               : mpn_sub_n (v3o, v3p, v3m, k3));

   // unpack coefficients of ho, and reduce mod m
   zn_array_unpack_SAFE (z, v3o, n3o, 2 * b, b + 1, k3);
   array_reduce (res + 1, 2, z, n3o, w, redc, mod);

   ZNP_FASTFREE (z);
   ZNP_FASTFREE (limbs);
}
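/*
   Added toy GMP verification (assumed values, not library code) of the
   identities behind the +B / -B evaluation points above: writing
   h(x) = he(x^2) + x*ho(x^2), we have h(B) + h(-B) = 2*he(B^2) and
   h(B) - h(-B) = 2*B*ho(B^2).
*/
#include <gmp.h>
#include <assert.h>

int
main (void)
{
   const unsigned long h[4] = {3, 1, 4, 1};     // h = 3 + x + 4x^2 + x^3
   mpz_t B, hp, hm, t, e, o;
   mpz_inits (B, hp, hm, t, e, o, NULL);
   mpz_set_ui (B, 1UL << 16);                   // B = 2^16

   // h(B) and h(-B) by Horner
   for (int i = 3; i >= 0; i--)
   {
      mpz_mul (hp, hp, B);  mpz_add_ui (hp, hp, h[i]);
      mpz_neg (t, B);
      mpz_mul (hm, hm, t);  mpz_add_ui (hm, hm, h[i]);
   }

   // 2 * he(B^2) = 2 * (3 + 4*B^2)
   mpz_mul (e, B, B); mpz_mul_ui (e, e, 4); mpz_add_ui (e, e, 3); mpz_mul_ui (e, e, 2);
   // 2 * B * ho(B^2) = 2 * B * (1 + B^2)
   mpz_mul (o, B, B); mpz_add_ui (o, o, 1); mpz_mul (o, o, B); mpz_mul_ui (o, o, 2);

   mpz_add (t, hp, hm);  assert (mpz_cmp (t, e) == 0);
   mpz_sub (t, hp, hm);  assert (mpz_cmp (t, o) == 0);

   mpz_clears (B, hp, hm, t, e, o, NULL);
   return 0;
}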
void
virtual_pmfvec_ifft (virtual_pmfvec_t vec, ulong n, int fwd, ulong t)
{
   ZNP_ASSERT (vec->lgK <= vec->lgM + 1);
   ZNP_ASSERT (t * vec->K < 2 * vec->M);
   ZNP_ASSERT (n + fwd <= vec->K);

   if (vec->lgK == 0)
      return;

   vec->lgK--;
   vec->K >>= 1;

   const zn_mod_struct* mod = vec->mod;
   virtual_pmf_t* data = vec->data;
   ulong M = vec->M;
   ulong K = vec->K;
   ulong s, r = M >> vec->lgK;
   long i;

   if (n + fwd <= K)
   {
      for (i = K - 1; i >= (long) n; i--)
      {
         virtual_pmf_add (data[i], data[i + K]);
         virtual_pmf_divby2 (data[i]);
      }

      virtual_pmfvec_ifft (vec, n, fwd, t << 1);

      for (; i >= 0; i--)
      {
         virtual_pmf_add (data[i], data[i]);
         virtual_pmf_sub (data[i], data[i + K]);
      }
   }
   else
   {
      virtual_pmfvec_ifft (vec, K, 0, t << 1);

      for (i = K - 1, s = t + r * i; i >= (long)(n - K); i--, s -= r)
      {
         virtual_pmf_sub (data[i + K], data[i]);
         virtual_pmf_sub (data[i], data[i + K]);
         virtual_pmf_rotate (data[i + K], M + s);
      }

      vec->data += K;
      virtual_pmfvec_ifft (vec, n - K, fwd, t << 1);
      vec->data -= K;

      for (; i >= 0; i--, s -= r)
      {
         virtual_pmf_rotate (data[i + K], M - s);
         virtual_pmf_bfly (data[i + K], data[i]);
      }
   }

   vec->K <<= 1;
   vec->lgK++;
}
void
zn_array_mul_fft_dft (ulong* res,
                      const ulong* op1, size_t n1,
                      const ulong* op2, size_t n2,
                      unsigned lgT, const zn_mod_t mod)
{
   ZNP_ASSERT (mod->m & 1);
   ZNP_ASSERT (n2 >= 1);
   ZNP_ASSERT (n1 >= n2);

   if (lgT == 0)
   {
      // no layers of DFT; just call usual FFT routine
      int sqr = (op1 == op2) && (n1 == n2);
      ulong x = zn_array_mul_fft_fudge (n1, n2, sqr, mod);
      zn_array_mul_fft (res, op1, n1, op2, n2, x, mod);
      return;
   }

   unsigned lgM, lgK;

   // number of pmf_t coefficients for each input poly
   ulong m1, m2;

   // figure out how big the transform needs to be
   mul_fft_params (&lgK, &lgM, &m1, &m2, n1, n2);

   // number of pmf_t coefficients for output poly
   ulong m = m1 + m2 - 1;

   ulong M = 1UL << lgM;
   ulong K = 1UL << lgK;
   ptrdiff_t skip = M + 1;

   size_t n3 = n1 + n2 - 1;

   // Split up transform into length K = U * T, i.e. U columns and T rows.
   if (lgT >= lgK)
      lgT = lgK;
   unsigned lgU = lgK - lgT;
   ulong U = 1UL << lgU;
   ulong T = 1UL << lgT;

   // space for two input rows, and one partial row
   pmfvec_t in1, in2, part;
   pmfvec_init (in1, lgU, skip, lgM, mod);
   pmfvec_init (in2, lgU, skip, lgM, mod);
   pmfvec_init (part, lgU, skip, lgM, mod);

   // the virtual pmfvec_t that we use for the column DFTs
   virtual_pmfvec_t col;
   virtual_pmfvec_init (col, lgT, lgM, mod);

   // zero the output
   zn_array_zero (res, n3);

   long i, j, k;
   int which;

   // Write m = U * mT + mU, where 0 <= mU < U
   ulong mU = m & (U - 1);
   ulong mT = m >> lgU;

   // for each row (beginning with the last partial row if it exists)....
   for (i = mT - (mU == 0); i >= 0; i--)
   {
      ulong i_rev = bit_reverse (i, lgT);

      // for each input array....
      for (which = 0; which < 2; which++)
      {
         pmfvec_struct* in = which ? in2 : in1;
         const ulong* op = which ? op2 : op1;
         size_t n = which ? n2 : n1;

         pmf_t p = in->data;

         for (j = 0; j < U; j++, p += in->skip)
         {
            // compute the i-th row of the j-th column as it would look after
            // the column FFTs, using naive DFT
            pmf_zero (p, M);

            ulong r = i_rev << (lgM - lgT + 1);

            for (k = 0; k < T; k++)
            {
               merge_chunk_to_pmf (p, op, n, (k * U + j) << (lgM - 1), M, mod);
               pmf_rotate (p, -r);
            }

            pmf_rotate (p, (i_rev * j) << (lgM - lgK + 1));
         }

         // Now we've got the whole row; run FFT on the row
         pmfvec_fft (in, (i == mT) ? mU : U, U, 0);
      }

      if (i == mT)
      {
         // pointwise multiply the two partial rows
         pmfvec_mul (part, in1, in2, mU, i == 0);
         // remove fudge factor
         pmfvec_scalar_mul (part, mU, pmfvec_mul_fudge (lgM, 0, mod));

         // zero remainder of the partial row; we will subsequently add
         // in contributions from the vertical IFFTs when we process the other
         // rows.
         for (j = mU; j < U; j++)
            pmf_zero (part->data + part->skip * j, M);
      }
      else
      {
         // pointwise multiply the two rows
         pmfvec_mul (in1, in1, in2, U, i == 0);
         // remove fudge factor
         pmfvec_scalar_mul (in1, U, pmfvec_mul_fudge (lgM, 0, mod));

         // horizontal IFFT this row
         pmfvec_ifft (in1, U, 0, U, 0);

         // simulate vertical IFFTs with DFTs
         for (j = 0; j < U; j++)
         {
            virtual_pmfvec_reset (col);
            virtual_pmf_import (col->data[i], in1->data + in1->skip * j);
            virtual_pmfvec_ifft (col, mT + (j < mU), (j >= mU) && mU,
                                 j << (lgM + 1 - lgK));

            if ((j >= mU) && mU)
            {
               // add contribution to partial row (only for rightmost columns)
               pmf_t src = virtual_pmf_export (col->data[mT]);
               if (src)
                  pmf_add (part->data + part->skip * j, src, M, mod);
            }

            // add contributions to output
            for (k = 0; k < mT + (j < mU); k++)
               merge_chunk_from_pmf (res, n3, virtual_pmf_export (col->data[k]),
                                     (k * U + j) * M/2, M, mod);
         }
      }
   }

   // now finish off the partial row
   if (mU)
   {
      // horizontal IFFT partial row
      pmfvec_ifft (part, mU, 0, U, 0);

      // simulate leftmost vertical IFFTs
      for (j = 0; j < mU; j++)
      {
         virtual_pmfvec_reset (col);
         virtual_pmf_import (col->data[mT], part->data + part->skip * j);
         virtual_pmfvec_ifft (col, mT + 1, 0, j << (lgM + 1 - lgK));

         // add contributions to output
         for (k = 0; k <= mT; k++)
            merge_chunk_from_pmf (res, n3, virtual_pmf_export (col->data[k]),
                                  (k * U + j) * M/2, M, mod);
      }
   }

   // normalise result
   zn_array_scalar_mul (res, res, n3, zn_mod_pow2 (-lgK, mod), mod);

   virtual_pmfvec_clear (col);
   pmfvec_clear (part);
   pmfvec_clear (in2);
   pmfvec_clear (in1);
}
void
zn_array_mul_fft (ulong* res,
                  const ulong* op1, size_t n1,
                  const ulong* op2, size_t n2,
                  ulong x, const zn_mod_t mod)
{
   ZNP_ASSERT (mod->m & 1);
   ZNP_ASSERT (n2 >= 1);
   ZNP_ASSERT (n1 >= n2);

   unsigned lgK, lgM;

   // number of pmf_t coefficients for each input poly
   ulong m1, m2;

   // figure out how big the transform needs to be
   mul_fft_params (&lgK, &lgM, &m1, &m2, n1, n2);

   // number of pmf_t coefficients for output poly
   ulong m3 = m1 + m2 - 1;

   ulong M = 1UL << lgM;
   ulong K = 1UL << lgK;
   ptrdiff_t skip = M + 1;

   pmfvec_t vec1, vec2;

   int sqr = (op1 == op2 && n1 == n2);

   if (!sqr)
   {
      // multiplying two distinct inputs

      // split inputs into pmf_t's and perform FFTs
      pmfvec_init (vec1, lgK, skip, lgM, mod);
      fft_split (vec1, op1, n1, 0, 1, 0);
      pmfvec_fft (vec1, m3, m1, 0);

      // note: we apply the fudge factor here, because the second input is
      // shorter than both the first input and the output :-)
      pmfvec_init (vec2, lgK, skip, lgM, mod);
      fft_split (vec2, op2, n2, 0, x, 0);
      pmfvec_fft (vec2, m3, m2, 0);

      // pointwise multiplication
      pmfvec_mul (vec1, vec1, vec2, m3, 1);

      pmfvec_clear (vec2);
   }
   else
   {
      // squaring a single input

      // split input into pmf_t's and perform FFTs
      pmfvec_init (vec1, lgK, skip, lgM, mod);
      fft_split (vec1, op1, n1, 0, 1, 0);
      pmfvec_fft (vec1, m3, m1, 0);

      // pointwise multiplication
      pmfvec_mul (vec1, vec1, vec1, m3, 1);
   }

   // inverse FFT, and write output
   pmfvec_ifft (vec1, m3, 0, m3, 0);
   size_t n3 = n1 + n2 - 1;
   fft_combine (res, n3, vec1, m3, 0);
   pmfvec_clear (vec1);

   // if we're squaring, then we haven't applied the fudge factor yet,
   // so do it now
   if (sqr)
      zn_array_scalar_mul_or_copy (res, res, n3, x, mod);
}
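/*
   Added usage sketch (not original library code): the x argument lets a scalar
   be folded into the transform for free. Passing the value returned by
   zn_array_mul_fft_fudge(), exactly as the lgT == 0 branch of
   zn_array_mul_fft_dft() above does, yields the plain product op1 * op2.
   The wrapper name here is hypothetical.
*/
void
zn_array_mul_fft_plain (ulong* res,
                        const ulong* op1, size_t n1,
                        const ulong* op2, size_t n2,
                        const zn_mod_t mod)
{
   int sqr = (op1 == op2 && n1 == n2);
   ulong x = zn_array_mul_fft_fudge (n1, n2, sqr, mod);
   zn_array_mul_fft (res, op1, n1, op2, n2, x, mod);
}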