Example #1
void
mulmid_fft_params (unsigned* lgK, unsigned* lgM, ulong* m1, ulong* m2,
                   ulong* p, size_t n1, size_t n2)
{
   ZNP_ASSERT (n2 >= 1);
   ZNP_ASSERT (n1 >= n2);

   unsigned _lgM;
   size_t _m1;
   ulong M, _p;

   // increase lgM until all the conditions are satisfied
   for (_lgM = 1; ; _lgM++)
   {
      M = 1UL << _lgM;
      _p = ((-n2) & (M/2 - 1)) + 1;
      _m1 = CEIL_DIV_2EXP (n1 + _p, _lgM - 1);
      if (_m1 <= 2 * M)
         break;
   }

   *lgM = _lgM;
   *lgK = (_m1 > M) ? (_lgM + 1) : _lgM;
   *p = _p;
   *m1 = _m1;
   *m2 = CEIL_DIV_2EXP (n2, _lgM - 1);
}
Example #2
void
zn_array_invert (ulong* res, const ulong* op, size_t n, const zn_mod_t mod)
{
   ZNP_ASSERT (n >= 1);
   
   // for now assume input is monic
   ZNP_ASSERT (op[0] == 1);
   
   if (n == 1)
   {
      res[0] = 1;
      return;
   }
   
   size_t half = (n + 1) / 2;    // ceil(n / 2)
   
   // recursively obtain the first half of the output
   zn_array_invert (res, op, half, mod);

   // extend to second half of the output
   if (mod->m & 1)
      zn_array_invert_extend_fft (res + half, res, op, half, n - half, mod);
   else
      zn_array_invert_extend (res + half, res, op, half, n - half, mod);
}
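
For a concrete cross-check of the inversion routines, the series inverse can also be computed by the direct quadratic recurrence. The sketch below is illustrative only and independent of zn_poly; it assumes a modulus m below 2^32 (so the 64-bit accumulator cannot overflow) and a monic input (f[0] == 1), as asserted above.

#include <stddef.h>

/* Naive O(n^2) power series inversion mod m: given f[0..n) with f[0] == 1,
   compute g[0..n) with f*g = 1 + O(x^n), via
      g[i] = -(f[1]*g[i-1] + ... + f[i]*g[0])  (mod m).  */
static void
naive_series_invert (unsigned long* g, const unsigned long* f, size_t n,
                     unsigned long m)
{
   size_t i, j;
   g[0] = 1;      // since f[0] == 1
   for (i = 1; i < n; i++)
   {
      unsigned long long acc = 0;
      for (j = 0; j < i; j++)
         acc = (acc + (unsigned long long) f[i - j] * g[j]) % m;
      g[i] = (unsigned long) ((m - acc) % m);
   }
}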
Example #3
/*
   Helper function for ref_zn_array_unpack().

   Inverse operation of ref_zn_array_pack_helper(); each output coefficient
   occupies ceil(b / ULONG_BITS) ulongs.
   
   Running time is soft-linear in output length.
*/
void
ref_zn_array_unpack_helper (ulong* res, const mpz_t op, size_t n, unsigned b,
                            unsigned k)
{
   ZNP_ASSERT (n >= 1);
   ZNP_ASSERT (mpz_sizeinbase (op, 2) <= n * b + k);

   unsigned w = CEIL_DIV (b, ULONG_BITS);

   mpz_t y;
   mpz_init (y);
   
   if (n == 1)
   {
      // base case
      unsigned i;
      mpz_tdiv_q_2exp (y, op, k);
      for (i = 0; i < w; i++)
      {
         res[i] = mpz_get_ui (y);
         mpz_tdiv_q_2exp (y, y, ULONG_BITS);
      }
   }
   else
   {
      // recursively split into top and bottom halves
      mpz_tdiv_q_2exp (y, op, (n / 2) * b + k);
      ref_zn_array_unpack_helper (res + w * (n / 2), y, n - n / 2, b, 0);
      mpz_tdiv_r_2exp (y, op, (n / 2) * b + k);
      ref_zn_array_unpack_helper (res, y, n / 2, b, k);
   }
   
   mpz_clear (y);
}
Example #4
void
zn_array_mulmid_fft_precomp1_init (zn_array_mulmid_fft_precomp1_t res,
                                   const ulong* op1, size_t n1, size_t n2,
                                   ulong x, const zn_mod_t mod)
{
   ZNP_ASSERT (mod->m & 1);
   ZNP_ASSERT (n2 >= 1);
   ZNP_ASSERT (n1 >= n2);

   res->n1 = n1;
   res->n2 = n2;
   
   unsigned lgK, lgM;
   
   mulmid_fft_params (&lgK, &lgM, &res->m1, &res->m2, &res->p, n1, n2);

   ulong M = 1UL << lgM;
   ptrdiff_t skip = M + 1;

   // allocate space for transposed IFFT
   pmfvec_init (res->vec1, lgK, skip, lgM, mod);
   
   // split input, with padding, in reversed order, and apply requested
   // scaling factor
   pmfvec_reverse (res->vec1, res->m1);
   fft_split (res->vec1, op1, n1, res->p, x, 0);
   pmfvec_reverse (res->vec1, res->m1);
   
   // transposed IFFT first input
   pmfvec_tpifft (res->vec1, res->m1, 0, res->m1, 0);
}
Example #5
void
zn_array_mulmid_fft (ulong* res,
                     const ulong* op1, size_t n1,
                     const ulong* op2, size_t n2,
                     ulong x, const zn_mod_t mod)
{
   ZNP_ASSERT (mod->m & 1);
   ZNP_ASSERT (n2 >= 1);
   ZNP_ASSERT (n1 >= n2);
   
   // re-use the precomp1 code
   zn_array_mulmid_fft_precomp1_t precomp;
   zn_array_mulmid_fft_precomp1_init (precomp, op1, n1, n2, x, mod);
   zn_array_mulmid_fft_precomp1_execute (res, op2, 1, precomp);
   zn_array_mulmid_fft_precomp1_clear (precomp);
}
Example #6
void
virtual_pmf_bfly (virtual_pmf_t op1, virtual_pmf_t op2)
{
   ZNP_ASSERT (op1->parent == op2->parent);
   struct virtual_pmfvec_struct* parent = op1->parent;
   
   // op1 == 0
   if (op1->index == -1)
   {
      virtual_pmf_set (op1, op2);
      return;
   }
   
   // op2 == 0
   if (op2->index == -1)
   {
      virtual_pmf_set (op2, op1);
      virtual_pmf_rotate (op2, parent->M);
      return;
   }

   virtual_pmf_isolate (op1);
   virtual_pmf_isolate (op2);

   pmf_t p1 = parent->buf[op1->index];
   pmf_t p2 = parent->buf[op2->index];

   p1[0] = op1->bias;
   p2[0] = op2->bias;

   pmf_bfly (p1, p2, parent->M, parent->mod);
}
Example #7
void
virtual_pmf_sub (virtual_pmf_t res, virtual_pmf_t op)
{
   ZNP_ASSERT (res->parent == op->parent);
   struct virtual_pmfvec_struct* parent = res->parent;

   // op == 0
   if (op->index == -1)
      return;
      
   // res == 0
   if (res->index == -1)
   {
      virtual_pmf_set (res, op);
      virtual_pmf_rotate (res, parent->M);
      return;
   }
   
   virtual_pmf_isolate (res);

   pmf_t p2 = parent->buf[res->index];
   pmf_t p1 = parent->buf[op->index];

   p2[0] = res->bias;
   p1[0] = op->bias;

   pmf_sub (p2, p1, parent->M, parent->mod);
}
Example #8
/*
   tests zn_array_pack() once for given n, b, k
*/
int
testcase_zn_array_pack (size_t n, unsigned b, unsigned k)
{
   ZNP_ASSERT (b >= 1);
   ZNP_ASSERT (n >= 1);

   int success = 1;
   ulong* in = (ulong*) malloc (sizeof (ulong) * n);

   size_t size = CEIL_DIV (n * b + k, GMP_NUMB_BITS);
   mp_limb_t* res = (mp_limb_t*) malloc (sizeof (mp_limb_t) * (size + 2));
   mp_limb_t* ref = (mp_limb_t*) malloc (sizeof (mp_limb_t) * (size + 2));

   // sentinel values to detect buffer overflow
   res[0] = res[size + 1] = ref[0] = ref[size + 1] = 0x1234;

   // generate random data: at most b bits per input coefficient, possibly less
   unsigned rand_bits = (b >= ULONG_BITS) ? ULONG_BITS : b;
   rand_bits = random_ulong (rand_bits) + 1;
   ulong max = (rand_bits == ULONG_BITS)
                        ? ((ulong)(-1)) : ((1UL << rand_bits) - 1);
   size_t i;
   for (i = 0; i < n; i++)
      in[i] = random_ulong (max);

   // run target and reference implementation
   zn_array_pack (res + 1, in, n, 1, b, k, 0);
   ref_zn_array_pack (ref + 1, in, n, b, k);
   
   // check that the sentinels are intact
   success = success && (res[0] == 0x1234);
   success = success && (ref[0] == 0x1234);
   success = success && (res[size + 1] == 0x1234);
   success = success && (ref[size + 1] == 0x1234);
   // check correct result
   success = success && (mpn_cmp (res + 1, ref + 1, size) == 0);

   free (ref);
   free (res);
   free (in);
   
   return success;
}
Example #9
unsigned
virtual_pmfvec_find_slot (virtual_pmfvec_t vec)
{
   unsigned i;
   for (i = 0; i < vec->max_buffers; i++)
      if (!vec->buf[i])
         return i;
   
   // this should never happen; we should always have enough slots
   ZNP_ASSERT (0);
   return 0;   // unreachable when assertions are enabled
}
Example #10
void
zn_array_pack1 (mp_limb_t* res, const ulong* op, size_t n, ptrdiff_t s,
                unsigned b, unsigned k, size_t r)
{
   ZNP_ASSERT (b > 0 && b <= ULONG_BITS);
   
#if GMP_NAIL_BITS == 0  &&  ULONG_BITS == GMP_NUMB_BITS

   // where to write the next limb
   mp_limb_t* dest = res;
   
   // write leading zero-padding
   while (k >= ULONG_BITS)
   {
      *dest++ = 0;
      k -= ULONG_BITS;
   }

   // limb currently being filled
   mp_limb_t buf = 0;
   // number of bits used in buf; always in [0, ULONG_BITS)
   unsigned buf_b = k;
   unsigned buf_b_old;
   
   for (; n > 0; n--, op += s)
   {
      ZNP_ASSERT (b >= ULONG_BITS  ||  *op < (1UL << b));
      
      // put low bits of current input into buffer
      buf += *op << buf_b;
      buf_b_old = buf_b;
      buf_b += b;
      if (buf_b >= ULONG_BITS)
      {
         // buffer is full; flush it
         *dest++ = buf;
         buf_b -= ULONG_BITS;
         // put remaining bits of current input into buffer
         buf = buf_b_old ? (*op >> (ULONG_BITS - buf_b_old)) : 0;
      }
   }
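
   // --- The listing appears to be truncated here.  The remainder below is a
   // --- hedged reconstruction, inferred from the loop above, of what is
   // --- still needed: flush the last partially filled limb, zero-pad the
   // --- output to the requested length r (in limbs), and close the function.
   // --- The exact tail in zn_poly may differ.

   // write out the final limb if it still holds bits
   if (buf_b)
      *dest++ = buf;

   // if a total output length r was requested, zero-pad up to r limbs
   if (r)
      while ((size_t) (dest - res) < r)
         *dest++ = 0;

#endif
}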
Example #11
void
zn_array_invert_extend (ulong* res, const ulong* approx, const ulong* op,
                        size_t n1, size_t n2, const zn_mod_t mod)
{
   ZNP_ASSERT (n2 >= 1);
   ZNP_ASSERT (n1 >= n2);

   // The algorithm is basically Newton iteration, inspired partly by the
   // algorithm in [HZ04], as follows.

   // Let f be the input series, of length n1 + n2.
   // Let g be the current approximation to 1/f, of length n1.
   
   // By Newton iteration, (2*g - g*g*f) is a length n1 + n2 approximation
   // to 1/f. Therefore the output of this function should be terms
   // [n1, n1 + n2) of -g*g*f.
   
   // We have g*f = 1 + h*x^n1 + O(x^(n1 + n2)), where h has length n2,
   // i.e. h consists of terms [n1, n1 + n2) of g*f. Therefore h may be
   // recovered as the middle product of f[1, n1 + n2) and g[0, n1).
   
   // Then g*g*f = g + g*h*x^n1 + O(x^(n1 + n2)). Since g has length
   // n1, the output is (the negative of) the first n2 coefficients of g*h.


   // Compute h, put it in res[0, n2).
   zn_array_mulmid (res, op + 1, n1 + n2 - 1, approx, n1, mod);
   
   // Compute g * h, put it into a scratch buffer.
   ZNP_FASTALLOC (temp, ulong, 6624, n1 + n2 - 1);
   zn_array_mul (temp, approx, n1, res, n2, mod);
   
   // Negate the first n2 coefficients of g * h into the output buffer.
   zn_array_neg (res, temp, n2, mod);
   ZNP_FASTFREE (temp);
}
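
The middle products used above can be checked against the direct definition: for inputs of lengths n1 >= n2, the middle product consists of coefficients n2-1 through n1-1 of the full product. A minimal quadratic reference follows; it is illustrative only, independent of zn_poly, and assumes a modulus m below 2^32 so the accumulator cannot overflow.

#include <stddef.h>

/* Naive middle product: write the n1 - n2 + 1 coefficients of indices
   n2-1 .. n1-1 of the product a*b into res, reduced mod m.  */
static void
naive_mulmid (unsigned long* res, const unsigned long* a, size_t n1,
              const unsigned long* b, size_t n2, unsigned long m)
{
   size_t i, j;
   for (i = 0; i <= n1 - n2; i++)
   {
      unsigned long long acc = 0;
      for (j = 0; j < n2; j++)
         acc = (acc + (unsigned long long) a[i + n2 - 1 - j] * b[j]) % m;
      res[i] = (unsigned long) acc;
   }
}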
Example #12
void
merge_chunk_to_pmf (pmf_t res, const ulong* op, size_t n, size_t k, ulong M,
                    const zn_mod_t mod)
{
   ZNP_ASSERT ((M & 1) == 0);
   
   ulong r = (-res[0]) & (2*M - 1);
   
   size_t end = k + M/2;
   if (end > n)
      end = n;
   if (k >= end)
      // nothing to do
      return;
   
   op += k;
   ulong size = end - k;
   // now we need to handle op[0, size), and we are guaranteed size <= M/2.

   if (r < M)
   {
      if (size <= M - r)
         zn_array_add_inplace (res + 1 + r, op, size, mod);
      else
      {
         zn_array_add_inplace (res + 1 + r, op, M - r, mod);
         // negacyclic wraparound:
         zn_array_sub_inplace (res + 1, op + M - r, size - M + r, mod);
      }
   }
   else
   {
      r -= M;

      if (size <= M - r)
         zn_array_sub_inplace (res + 1 + r, op, size, mod);
      else
      {
         zn_array_sub_inplace (res + 1 + r, op, M - r, mod);
         // negacyclic wraparound:
         zn_array_add_inplace (res + 1, op + M - r, size - M + r, mod);
      }
   }
}
Example #13
/*
   Helper function for ref_zn_array_pack().

   Sets x = 2^k * (op[0] + op[1]*2^b + ... + op[n-1]*2^((n-1)*b)).
   
   Running time is soft-linear in output length.
*/
void
ref_zn_array_pack_helper (mpz_t x, const ulong* op, size_t n, unsigned b,
                          unsigned k)
{
   ZNP_ASSERT (n >= 1);

   if (n == 1)
   {
      // base case
      mpz_set_ui (x, op[0]);
      mpz_mul_2exp (x, x, k);
   }
   else
   {
      // recursively split into top and bottom halves
      mpz_t y;
      mpz_init (y);
      ref_zn_array_pack_helper (x, op, n / 2, b, k);
      ref_zn_array_pack_helper (y, op + n / 2, n - n / 2, b, 0);
      mpz_mul_2exp (y, y, (n / 2) * b + k);
      mpz_add (x, x, y);
      mpz_clear (y);
   }
}
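
A small usage sketch of the helper above (a hypothetical test driver, not part of zn_poly; it assumes the zn_poly headers that define ulong and declare ref_zn_array_pack_helper() are in scope):

#include <gmp.h>

void
demo_pack_helper (void)
{
   ulong a[3] = {1, 2, 3};
   mpz_t x;
   mpz_init (x);

   // pack with b = 8 bits per coefficient, shifted left by k = 4 bits
   ref_zn_array_pack_helper (x, a, 3, 8, 4);

   // x = 2^4 * (1 + 2*2^8 + 3*2^16); prints "302010"
   gmp_printf ("%Zx\n", x);

   mpz_clear (x);
}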
Example #14
void
zn_array_invert_extend_fft (ulong* res, const ulong* approx, const ulong* op,
                            size_t n1, size_t n2, const zn_mod_t mod)
{
   ZNP_ASSERT (n2 >= 1);
   ZNP_ASSERT (n1 >= n2);
   ZNP_ASSERT (mod->m & 1);

   // The algorithm here is the same as in zn_array_invert_extend(), except
   // that we work with the FFTs directly. This allows us to save one FFT,
   // since we use the FFT of g in both the middle product step and the
   // product step.

   // Determine FFT parameters for computing h = middle product of
   // f[1, n1 + n2) and g[0, n1). (These parameters will also work for the
   // subsequent product g * h.)
   unsigned lgK, lgM; 
   ulong m1, m2, m3, p;

   mulmid_fft_params (&lgK, &lgM, &m3, &m1, &p, n1 + n2 - 1, n1);
   m2 = m3 - m1 + 1;

   // We now have
   //     m1 = ceil(n1 / (M/2))
   //        = (n1 + p - 1) / (M/2).
   // Therefore
   //     m3 = ceil((n1 + n2 - 1 + p) / (M/2))
   //        = ceil(n2 / (M/2)) + (n1 + p - 1) / (M/2)
   // and
   //     m2 = ceil(n2 / (M/2)) + 1.
   
   ulong M = 1UL << lgM;
   ulong K = 1UL << lgK;
   ptrdiff_t skip = M + 1;

   pmfvec_t vec1, vec2;
   pmfvec_init (vec1, lgK, skip, lgM, mod);
   pmfvec_init (vec2, lgK, skip, lgM, mod);

   // Find scaling factor that needs to be applied to both of the products
   // below; takes into account the fudge from the pointwise multiplies, and
   // the division by 2^lgK coming from the FFTs.
   ulong x = pmfvec_mul_fudge (lgM, 0, mod);
   x = zn_mod_mul (x, zn_mod_pow2 (-lgK, mod), mod);

   // Split g[0, n1) into m1 coefficients, apply scaling factor, and compute
   // m3 Fourier coefficients, written to vec2.
   fft_split (vec2, approx, n1, 0, x, 0);
   pmfvec_fft (vec2, m3, m1, 0);

   // Split f[1, n1 + n2) into m3 coefficients (in reversed order, with
   // appropriate zero-padding), and compute transposed IFFT of length m3,
   // written to vec1.
   pmfvec_reverse (vec1, m3);
   fft_split (vec1, op + 1, n1 + n2 - 1, p, 1, 0);
   pmfvec_reverse (vec1, m3);
   pmfvec_tpifft (vec1, m3, 0, m3, 0);

   // Pointwise multiply the above FFT and transposed IFFT, into vec1.
   pmfvec_mul (vec1, vec1, vec2, m3, 0);
   
   // Transposed FFT vec1, obtaining m2 coefficients, then reverse and combine.
   pmfvec_tpfft (vec1, m3, m2, 0);
   pmfvec_reverse (vec1, m2);
   fft_combine (res, n2, vec1, m2, 1);
   pmfvec_reverse (vec1, m2);
   
   // At this stage we have obtained the polynomial h in res[0, n2).
   // Now we must compute h * g.
   
   // Split h[0, n2) into m2 - 1 coefficients, and compute m3 - 1 Fourier
   // coefficients in vec1. For the splitting step, we set the bias to M,
   // which effectively negates everything, so we're really computing the FFT
   // of -h.
   fft_split (vec1, res, n2, 0, 1, M);
   pmfvec_fft (vec1, m3 - 1, m2 - 1, 0);

   // Pointwise multiply that FFT with the first FFT of g into vec2.
   pmfvec_mul (vec2, vec2, vec1, m3 - 1, 1);
   pmfvec_clear (vec1);

   // IFFT and combine, to obtain the product -h * g. We only need the low n2
   // terms of the product (we throw away the high n1 - 1 terms).
   pmfvec_ifft (vec2, m3 - 1, 0, m3 - 1, 0);
   fft_combine (res, n2, vec2, m3 - 1, 0);
   pmfvec_clear (vec2);
}
Example #15
/*
   Multiplication/squaring using Kronecker substitution at 2^b, -2^b,
   2^(-b) and -2^(-b).
*/
void
zn_array_mul_KS4 (ulong* res,
                  const ulong* op1, size_t n1,
                  const ulong* op2, size_t n2,
                  int redc, const zn_mod_t mod)
{
   ZNP_ASSERT (n2 >= 1);
   ZNP_ASSERT (n1 >= n2);
   ZNP_ASSERT (n1 <= ULONG_MAX);
   ZNP_ASSERT ((mod->m & 1) || !redc);

   if (n2 == 1)
   {
      // code below needs n2 > 1, so fall back on scalar multiplication
      _zn_array_scalar_mul (res, op1, n1, op2[0], redc, mod);
      return;
   }

   int sqr = (op1 == op2 && n1 == n2);

   // bits in each output coefficient
   unsigned bits = 2 * mod->bits + ceil_lg (n2);
   
   // we're evaluating at x = B, -B, 1/B, -1/B,
   // where B = 2^b, and b = ceil(bits / 4)
   unsigned b = (bits + 3) / 4;

   // number of ulongs required to store each base-B^2 digit
   unsigned w = CEIL_DIV (2 * b, ULONG_BITS);
   ZNP_ASSERT (w <= 2);

   // Write f1(x) = f1e(x^2) + x * f1o(x^2)
   //       f2(x) = f2e(x^2) + x * f2o(x^2)
   //        h(x) =  he(x^2) + x *  ho(x^2)
   // "e" = even, "o" = odd

   size_t n1o = n1 / 2;
   size_t n1e = n1 - n1o;

   size_t n2o = n2 / 2;
   size_t n2e = n2 - n2o;

   size_t n3 = n1 + n2 - 1;   // length of h
   size_t n3o = n3 / 2;
   size_t n3e = n3 - n3o;

   // Put k1 = number of limbs needed to store f1(B) and |f1(-B)|.
   // In f1(B), the leading coefficient starts at bit position b * (n1 - 1)
   // and has length 2b, and the coefficients overlap so we need an extra bit
   // for the carry: this gives (n1 + 1) * b + 1 bits. Ditto for f2.
   size_t k1 = CEIL_DIV ((n1 + 1) * b + 1, GMP_NUMB_BITS);
   size_t k2 = CEIL_DIV ((n2 + 1) * b + 1, GMP_NUMB_BITS);
   size_t k3 = k1 + k2;

   // allocate space
   ZNP_FASTALLOC (limbs, mp_limb_t, 6624, 5 * k3);
   mp_limb_t* v1_buf0 = limbs;           // k1 limbs
   mp_limb_t* v2_buf0 = v1_buf0 + k1;    // k2 limbs
   mp_limb_t* v1_buf1 = v2_buf0 + k2;    // k1 limbs
   mp_limb_t* v2_buf1 = v1_buf1 + k1;    // k2 limbs
   mp_limb_t* v1_buf2 = v2_buf1 + k2;    // k1 limbs
   mp_limb_t* v2_buf2 = v1_buf2 + k1;    // k2 limbs
   mp_limb_t* v1_buf3 = v2_buf2 + k2;    // k1 limbs
   mp_limb_t* v2_buf3 = v1_buf3 + k1;    // k2 limbs
   mp_limb_t* v1_buf4 = v2_buf3 + k2;    // k1 limbs
   mp_limb_t* v2_buf4 = v1_buf4 + k1;    // k2 limbs

   // arrange overlapping buffers to minimise memory use
   // "p" = plus, "m" = minus
   // "n" = normal order, "r" = reciprocal order
   mp_limb_t* v1en = v1_buf0;
   mp_limb_t* v1on = v1_buf1;
   mp_limb_t* v1pn = v1_buf2;
   mp_limb_t* v1mn = v1_buf0;
   mp_limb_t* v2en = v2_buf0;
   mp_limb_t* v2on = v2_buf1;
   mp_limb_t* v2pn = v2_buf2;
   mp_limb_t* v2mn = v2_buf0;
   mp_limb_t* v3pn = v1_buf1;
   mp_limb_t* v3mn = v1_buf2;
   mp_limb_t* v3en = v1_buf0;
   mp_limb_t* v3on = v1_buf1;

   mp_limb_t* v1er = v1_buf2;
   mp_limb_t* v1or = v1_buf3;
   mp_limb_t* v1pr = v1_buf4;
   mp_limb_t* v1mr = v1_buf2;
   mp_limb_t* v2er = v2_buf2;
   mp_limb_t* v2or = v2_buf3;
   mp_limb_t* v2pr = v2_buf4;
   mp_limb_t* v2mr = v2_buf2;
   mp_limb_t* v3pr = v1_buf3;
   mp_limb_t* v3mr = v1_buf4;
   mp_limb_t* v3er = v1_buf2;
   mp_limb_t* v3or = v1_buf3;
   
   ZNP_FASTALLOC (z, ulong, 6624, 2 * w * (n3e + 1));
   ulong* zn = z;
   ulong* zr = z + w * (n3e + 1);

   int v3m_neg;

   // -------------------------------------------------------------------------
   //     "normal" evaluation points
   
   if (!sqr)
   {
      // multiplication version

      // evaluate f1e(B^2) and B * f1o(B^2)
      // We need max(2 * b*n1e, 2 * b*n1o + b) bits for this packing step,
      // which is safe since (n1 + 1) * b + 1 >= max(2 * b*n1e, 2 * b*n1o + b).
      // Ditto for f2 below.
      zn_array_pack (v1en, op1, n1e, 2, 2 * b, 0, k1);
      zn_array_pack (v1on, op1 + 1, n1o, 2, 2 * b, b, k1);

      // compute  f1(B)  =  f1e(B^2) + B * f1o(B^2)
      //    and |f1(-B)| = |f1e(B^2) - B * f1o(B^2)|
      ZNP_ASSERT_NOCARRY (mpn_add_n (v1pn, v1en, v1on, k1));
      v3m_neg = signed_mpn_sub_n (v1mn, v1en, v1on, k1);

      // evaluate f2e(B^2) and B * f2o(B^2)
      zn_array_pack (v2en, op2, n2e, 2, 2 * b, 0, k2);
      zn_array_pack (v2on, op2 + 1, n2o, 2, 2 * b, b, k2);
      
      // compute  f2(B)  =  f2e(B^2) + B * f2o(B^2)
      //    and |f2(-B)| = |f2e(B^2) - B * f2o(B^2)|
      ZNP_ASSERT_NOCARRY (mpn_add_n (v2pn, v2en, v2on, k2));
      v3m_neg ^= signed_mpn_sub_n (v2mn, v2en, v2on, k2);

      // compute  h(B)  =  f1(B)   *  f2(B)
      //    and |h(-B)| = |f1(-B)| * |f2(-B)|
      // v3m_neg is set if h(-B) is negative
      ZNP_mpn_mul (v3pn, v1pn, k1, v2pn, k2);
      ZNP_mpn_mul (v3mn, v1mn, k1, v2mn, k2);
   }
   else
   {
      // squaring version

      // evaluate f1e(B^2) and B * f1o(B^2)
      zn_array_pack (v1en, op1, n1e, 2, 2 * b, 0, k1);
      zn_array_pack (v1on, op1 + 1, n1o, 2, 2 * b, b, k1);

      // compute  f1(B)  =  f1e(B^2) + B * f1o(B^2)
      //    and |f1(-B)| = |f1e(B^2) - B * f1o(B^2)|
      ZNP_ASSERT_NOCARRY (mpn_add_n (v1pn, v1en, v1on, k1));
      signed_mpn_sub_n (v1mn, v1en, v1on, k1);

      // compute h(B) =  f1(B)^2
      //    and h(-B) = |f1(-B)|^2
      // v3m_neg is cleared since h(-B) is never negative
      ZNP_mpn_mul (v3pn, v1pn, k1, v1pn, k1);
      ZNP_mpn_mul (v3mn, v1mn, k1, v1mn, k1);
      v3m_neg = 0;
   }

   // Each coefficient of h(B) is up to 4b bits long, so h(B) needs at most
   // ((n1 + n2 + 2) * b + 1) bits. (The extra +1 is to accommodate carries
   // generated by overlapping coefficients.)  The buffer has at least
   // ((n1 + n2 + 2) * b + 2) bits. Therefore we can safely store 2*h(B) etc.

   // compute     2 * he(B^2) = h(B) + h(-B)
   // and     B * 2 * ho(B^2) = h(B) - h(-B)
   if (v3m_neg)
   {
      ZNP_ASSERT_NOCARRY (mpn_sub_n (v3en, v3pn, v3mn, k3));
      ZNP_ASSERT_NOCARRY (mpn_add_n (v3on, v3pn, v3mn, k3));
   }
   else
   {
      ZNP_ASSERT_NOCARRY (mpn_add_n (v3en, v3pn, v3mn, k3));
      ZNP_ASSERT_NOCARRY (mpn_sub_n (v3on, v3pn, v3mn, k3));
   }

   // -------------------------------------------------------------------------
   //     "reciprocal" evaluation points

   // correction factors to take into account that if a polynomial has even
   // length, its even and odd coefficients are swapped when the polynomial
   // is reversed
   unsigned a1 = (n1 & 1) ? 0 : b;
   unsigned a2 = (n2 & 1) ? 0 : b;
   unsigned a3 = (n3 & 1) ? 0 : b;

   if (!sqr)
   {
      // multiplication version
   
      // evaluate B^(n1-1) * f1e(1/B^2) and B^(n1-2) * f1o(1/B^2)
      zn_array_pack (v1er, op1 + 2*(n1e - 1), n1e, -2, 2 * b, a1, k1);
      zn_array_pack (v1or, op1 + 1 + 2*(n1o - 1), n1o, -2, 2 * b, b - a1, k1);

      // compute  B^(n1-1) * f1(1/B) =
      //              B^(n1-1) * f1e(1/B^2) + B^(n1-2) * f1o(1/B^2)
      //    and  |B^(n1-1) * f1(-1/B)| =
      //             |B^(n1-1) * f1e(1/B^2) - B^(n1-2) * f1o(1/B^2)|
      ZNP_ASSERT_NOCARRY (mpn_add_n (v1pr, v1er, v1or, k1));
      v3m_neg = signed_mpn_sub_n (v1mr, v1er, v1or, k1);

      // evaluate B^(n2-1) * f2e(1/B^2) and B^(n2-2) * f2o(1/B^2)
      zn_array_pack (v2er, op2 + 2*(n2e - 1), n2e, -2, 2 * b, a2, k2);
      zn_array_pack (v2or, op2 + 1 + 2*(n2o - 1), n2o, -2, 2 * b, b - a2, k2);

      // compute  B^(n2-1) * f2(1/B) =
      //              B^(n2-1) * f2e(1/B^2) + B^(n2-2) * f2o(1/B^2)
      //    and  |B^(n2-1) * f2(-1/B)| =
      //             |B^(n2-1) * f2e(1/B^2) - B^(n2-2) * f2o(1/B^2)|
      ZNP_ASSERT_NOCARRY (mpn_add_n (v2pr, v2er, v2or, k2));
      v3m_neg ^= signed_mpn_sub_n (v2mr, v2er, v2or, k2);

      // compute B^(n3-1) * h(1/B) =
      //                 (B^(n1-1) * f1(1/B)) * (B^(n2-1) * f2(1/B))
      //     and |B^(n3-1) * h(-1/B)| =
      //                 |B^(n1-1) * f1(-1/B)| * |B^(n2-1) * f2(-1/B)|
      // v3m_neg is set if h(-1/B) is negative
      ZNP_mpn_mul (v3pr, v1pr, k1, v2pr, k2);
      ZNP_mpn_mul (v3mr, v1mr, k1, v2mr, k2);
   }
   else
   {
      // squaring version

      // evaluate B^(n1-1) * f1e(1/B^2) and B^(n1-2) * f1o(1/B^2)
      zn_array_pack (v1er, op1 + 2*(n1e - 1), n1e, -2, 2 * b, a1, k1);
      zn_array_pack (v1or, op1 + 1 + 2*(n1o - 1), n1o, -2, 2 * b, b - a1, k1);

      // compute  B^(n1-1) * f1(1/B) =
      //              B^(n1-1) * f1e(1/B^2) + B^(n1-2) * f1o(1/B^2)
      //    and  |B^(n1-1) * f1(-1/B)| =
      //             |B^(n1-1) * f1e(1/B^2) - B^(n1-2) * f1o(1/B^2)|
      ZNP_ASSERT_NOCARRY (mpn_add_n (v1pr, v1er, v1or, k1));
      signed_mpn_sub_n (v1mr, v1er, v1or, k1);

      // compute B^(n3-1) * h(1/B)  = (B^(n1-1) * f1(1/B))^2
      //     and B^(n3-1) * h(-1/B) = |B^(n1-1) * f1(-1/B)|^2
      // v3m_neg is cleared since h(-1/B) is never negative
      ZNP_mpn_mul (v3pr, v1pr, k1, v1pr, k1);
      ZNP_mpn_mul (v3mr, v1mr, k1, v1mr, k1);
      v3m_neg = 0;
   }

   // compute 2 * B^(n3-1) * he(1/B^2)
   //                = B^(n3-1) * h(1/B) + B^(n3-1) * h(-1/B)
   //    and  2 * B^(n3-2) * ho(1/B^2)
   //                = B^(n3-1) * h(1/B) - B^(n3-1) * h(-1/B)
   if (v3m_neg)
   {
      ZNP_ASSERT_NOCARRY (mpn_sub_n (v3er, v3pr, v3mr, k3));
      ZNP_ASSERT_NOCARRY (mpn_add_n (v3or, v3pr, v3mr, k3));
   }
   else
   {
      ZNP_ASSERT_NOCARRY (mpn_add_n (v3er, v3pr, v3mr, k3));
      ZNP_ASSERT_NOCARRY (mpn_sub_n (v3or, v3pr, v3mr, k3));
   }

   // -------------------------------------------------------------------------
   //     combine "normal" and "reciprocal" information

   // decompose he(B^2) and B^(2*(n3e-1)) * he(1/B^2) into base-B^2 digits
   zn_array_unpack_SAFE (zn, v3en, n3e + 1, 2 * b, 1, k3);
   zn_array_unpack_SAFE (zr, v3er, n3e + 1, 2 * b, a3 + 1, k3);
   
   // combine he(B^2) and he(1/B^2) information to get even coefficients of h
   zn_array_recover_reduce (res, 2, zn, zr, n3e, 2 * b, redc, mod);

   // decompose ho(B^2) and B^(2*(n3o-1)) * ho(1/B^2) into base-B^2 digits
   zn_array_unpack_SAFE (zn, v3on, n3o + 1, 2 * b, b + 1, k3);
   zn_array_unpack_SAFE (zr, v3or, n3o + 1, 2 * b, b - a3 + 1, k3);

   // combine ho(B^2) and ho(1/B^2) information to get odd coefficients of h
   zn_array_recover_reduce (res + 1, 2, zn, zr, n3o, 2 * b, redc, mod);
   
   ZNP_FASTFREE (z);
   ZNP_FASTFREE (limbs);
}
Example #16
/*
   Multiplication/squaring using Kronecker substitution at 2^b.
*/
void
zn_array_mul_KS1 (ulong* res,
                  const ulong* op1, size_t n1,
                  const ulong* op2, size_t n2,
                  int redc, const zn_mod_t mod)
{
   ZNP_ASSERT (n2 >= 1);
   ZNP_ASSERT (n1 >= n2);
   ZNP_ASSERT (n1 <= ULONG_MAX);
   ZNP_ASSERT ((mod->m & 1) || !redc);

   int sqr = (op1 == op2 && n1 == n2);

   // length of h
   size_t n3 = n1 + n2 - 1;
   
   // bits in each output coefficient
   unsigned b = 2 * mod->bits + ceil_lg (n2);
   
   // number of ulongs required to store each output coefficient
   unsigned w = CEIL_DIV (b, ULONG_BITS);
   ZNP_ASSERT (w <= 3);

   // number of limbs needed to store f1(2^b) and f2(2^b)
   size_t k1 = CEIL_DIV (n1 * b, GMP_NUMB_BITS);
   size_t k2 = CEIL_DIV (n2 * b, GMP_NUMB_BITS);

   // allocate space
   ZNP_FASTALLOC (limbs, mp_limb_t, 6624, 2 * (k1 + k2));
   mp_limb_t* v1 = limbs;     // k1 limbs
   mp_limb_t* v2 = v1 + k1;   // k2 limbs
   mp_limb_t* v3 = v2 + k2;   // k1 + k2 limbs

   if (!sqr)
   {
      // multiplication version

      // evaluate f1(2^b) and f2(2^b)
      zn_array_pack (v1, op1, n1, 1, b, 0, 0);
      zn_array_pack (v2, op2, n2, 1, b, 0, 0);

      // compute h(2^b) = f1(2^b) * f2(2^b)
      ZNP_mpn_mul (v3, v1, k1, v2, k2);
   }
   else
   {
      // squaring version

      // evaluate f1(2^b)
      zn_array_pack (v1, op1, n1, 1, b, 0, 0);

      // compute h(2^b) = f1(2^b)^2
      ZNP_mpn_mul (v3, v1, k1, v1, k1);
   }

   // unpack coefficients of h, and reduce mod m
   ZNP_FASTALLOC (z, ulong, 6624, n3 * w);
   zn_array_unpack_SAFE (z, v3, n3, b, 0, k1 + k2);
   array_reduce (res, 1, z, n3, w, redc, mod);

   ZNP_FASTFREE (z);
   ZNP_FASTFREE (limbs);
}
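
The substitution behind KS1 can be seen at small scale without any of the packing machinery above. The following self-contained GMP sketch (illustrative only, independent of zn_poly) evaluates two small polynomials at 2^b, multiplies the resulting integers, and reads the product coefficients back out of the base-2^b digits:

#include <gmp.h>

int
main (void)
{
   // f = 1 + 2x + 3x^2,  g = 4 + 5x;  product h = 4 + 13x + 22x^2 + 15x^3.
   // b = 16 bits per digit is ample here, since every coefficient of h
   // fits comfortably in 16 bits.
   const unsigned b = 16;
   const unsigned long f[3] = {1, 2, 3};
   const unsigned long g[2] = {4, 5};

   mpz_t F, G, H, c;
   mpz_inits (F, G, H, c, NULL);

   // evaluate F = f(2^b) and G = g(2^b) by Horner's rule
   int i;
   for (i = 2; i >= 0; i--) { mpz_mul_2exp (F, F, b); mpz_add_ui (F, F, f[i]); }
   for (i = 1; i >= 0; i--) { mpz_mul_2exp (G, G, b); mpz_add_ui (G, G, g[i]); }

   // one big integer multiplication stands in for the polynomial product
   mpz_mul (H, F, G);

   // peel off base-2^b digits; they are exactly the coefficients of h
   for (i = 0; i < 4; i++)
   {
      mpz_tdiv_r_2exp (c, H, b);
      mpz_tdiv_q_2exp (H, H, b);
      gmp_printf ("h[%d] = %Zd\n", i, c);   // prints 4, 13, 22, 15
   }

   mpz_clears (F, G, H, c, NULL);
   return 0;
}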
Example #17
/*
   Multiplication/squaring using Kronecker substitution at 2^b and 2^(-b).
   
   Note: this routine does not appear to be competitive in practice with the
   other KS routines. It's here just for fun.
*/
void
zn_array_mul_KS3 (ulong* res,
                  const ulong* op1, size_t n1,
                  const ulong* op2, size_t n2,
                  int redc, const zn_mod_t mod)
{
   ZNP_ASSERT (n2 >= 1);
   ZNP_ASSERT (n1 >= n2);
   ZNP_ASSERT (n1 <= ULONG_MAX);
   ZNP_ASSERT ((mod->m & 1) || !redc);

   int sqr = (op1 == op2 && n1 == n2);

   // length of h
   size_t n3 = n1 + n2 - 1;
   
   // bits in each output coefficient
   unsigned bits = 2 * mod->bits + ceil_lg (n2);
   
   // we're evaluating at x = B and 1/B, where B = 2^b, and b = ceil(bits / 2)
   unsigned b = (bits + 1) / 2;

   // number of ulongs required to store each base-B digit
   unsigned w = CEIL_DIV (b, ULONG_BITS);
   ZNP_ASSERT (w <= 2);
   
   // limbs needed to store f1(B) and B^(n1-1) * f1(1/B), ditto for f2
   size_t k1 = CEIL_DIV (n1 * b, GMP_NUMB_BITS);
   size_t k2 = CEIL_DIV (n2 * b, GMP_NUMB_BITS);
   
   // allocate space
   ZNP_FASTALLOC (limbs, mp_limb_t, 6624, 2 * (k1 + k2));
   mp_limb_t* v1 = limbs;       // k1 limbs
   mp_limb_t* v2 = v1 + k1;     // k2 limbs
   mp_limb_t* v3 = v2 + k2;     // k1 + k2 limbs

   ZNP_FASTALLOC (z, ulong, 6624, 2 * w * (n3 + 1));
   // "n" = normal order, "r" = reciprocal order
   ulong* zn = z;
   ulong* zr = z + w * (n3 + 1);

   if (!sqr)
   {
      // multiplication version

      // evaluate f1(B) and f2(B)
      zn_array_pack (v1, op1, n1, 1, b, 0, k1);
      zn_array_pack (v2, op2, n2, 1, b, 0, k2);

      // compute h(B) = f1(B) * f2(B)
      ZNP_mpn_mul (v3, v1, k1, v2, k2);
   }
   else
   {
      // squaring version

      // evaluate f1(B)
      zn_array_pack (v1, op1, n1, 1, b, 0, k1);

      // compute h(B) = f1(B)^2
      ZNP_mpn_mul (v3, v1, k1, v1, k1);
   }

   // decompose h(B) into base-B digits
   zn_array_unpack_SAFE (zn, v3, n3 + 1, b, 0, k1 + k2);

   if (!sqr)
   {
      // multiplication version

      // evaluate B^(n1-1) * f1(1/B) and B^(n2-1) * f2(1/B)
      zn_array_pack (v1, op1 + n1 - 1, n1, -1, b, 0, k1);
      zn_array_pack (v2, op2 + n2 - 1, n2, -1, b, 0, k2);

      // compute B^(n1+n2-2) * h(1/B) =
      //                     (B^(n1-1) * f1(1/B)) * (B^(n2-1) * f2(1/B))
      ZNP_mpn_mul (v3, v1, k1, v2, k2);
   }
   else
   {
      // squaring version

      // evaluate B^(n1-1) * f1(1/B)
      zn_array_pack (v1, op1 + n1 - 1, n1, -1, b, 0, k1);

      // compute B^(2*n1-2) * h(1/B) = (B^(n1-1) * f1(1/B))^2
      ZNP_mpn_mul (v3, v1, k1, v1, k1);
   }

   // decompose h(1/B) into base-B digits
   zn_array_unpack_SAFE (zr, v3, n3 + 1, b, 0, k1 + k2);

   // recover h(x) from h(B) and h(1/B)
   // (note: need to check that the high digit of each output coefficient
   // is < B - 1; this follows from an estimate in section 3.2 of [Har07].)
   zn_array_recover_reduce (res, 1, zn, zr, n3, b, redc, mod);
   
   ZNP_FASTFREE(z);
   ZNP_FASTFREE(limbs);
}
Example #18
/*
   Multiplication/squaring using Kronecker substitution at 2^b and -2^b.
*/
void
zn_array_mul_KS2 (ulong* res,
                  const ulong* op1, size_t n1,
                  const ulong* op2, size_t n2,
                  int redc, const zn_mod_t mod)
{
   ZNP_ASSERT (n2 >= 1);
   ZNP_ASSERT (n1 >= n2);
   ZNP_ASSERT (n1 <= ULONG_MAX);
   ZNP_ASSERT ((mod->m & 1) || !redc);
   
   if (n2 == 1)
   {
      // code below needs n2 > 1, so fall back on scalar multiplication
      _zn_array_scalar_mul (res, op1, n1, op2[0], redc, mod);
      return;
   }

   int sqr = (op1 == op2 && n1 == n2);

   // bits in each output coefficient
   unsigned bits = 2 * mod->bits + ceil_lg (n2);
   
   // we're evaluating at x = B and -B, where B = 2^b, and b = ceil(bits / 2)
   unsigned b = (bits + 1) / 2;

   // number of ulongs required to store each output coefficient
   unsigned w = CEIL_DIV (2 * b, ULONG_BITS);
   ZNP_ASSERT (w <= 3);

   // Write f1(x) = f1e(x^2) + x * f1o(x^2)
   //       f2(x) = f2e(x^2) + x * f2o(x^2)
   //        h(x) =  he(x^2) + x *  ho(x^2)
   // "e" = even, "o" = odd

   size_t n1o = n1 / 2;
   size_t n1e = n1 - n1o;

   size_t n2o = n2 / 2;
   size_t n2e = n2 - n2o;

   size_t n3 = n1 + n2 - 1;    // length of h
   size_t n3o = n3 / 2;
   size_t n3e = n3 - n3o;

   // f1(B) and |f1(-B)| are at most ((n1 - 1) * b + mod->bits) bits long.
   // However, when evaluating f1e(B^2) and B * f1o(B^2) the bitpacking
   // routine needs room for the last chunk of 2b bits. Therefore we need to
   // allow room for (n1 + 1) * b bits. Ditto for f2.
   size_t k1 = CEIL_DIV ((n1 + 1) * b, GMP_NUMB_BITS);
   size_t k2 = CEIL_DIV ((n2 + 1) * b, GMP_NUMB_BITS);
   size_t k3 = k1 + k2;

   // allocate space
   ZNP_FASTALLOC (limbs, mp_limb_t, 6624, 3 * k3);
   mp_limb_t* v1_buf0 = limbs;             // k1 limbs
   mp_limb_t* v2_buf0 = v1_buf0 + k1;      // k2 limbs
   mp_limb_t* v1_buf1 = v2_buf0 + k2;      // k1 limbs
   mp_limb_t* v2_buf1 = v1_buf1 + k1;      // k2 limbs
   mp_limb_t* v1_buf2 = v2_buf1 + k2;      // k1 limbs
   mp_limb_t* v2_buf2 = v1_buf2 + k1;      // k2 limbs

   // arrange overlapping buffers to minimise memory use
   // "p" = plus, "m" = minus
   mp_limb_t* v1e = v1_buf0;
   mp_limb_t* v2e = v2_buf0;
   mp_limb_t* v1o = v1_buf1;
   mp_limb_t* v2o = v2_buf1;
   mp_limb_t* v1p = v1_buf2;
   mp_limb_t* v2p = v2_buf2;
   mp_limb_t* v1m = v1_buf0;
   mp_limb_t* v2m = v2_buf0;
   mp_limb_t* v3m = v1_buf1;
   mp_limb_t* v3p = v1_buf0;
   mp_limb_t* v3e = v1_buf2;
   mp_limb_t* v3o = v1_buf0;
   
   ZNP_FASTALLOC (z, ulong, 6624, w * n3e);

   int v3m_neg;

   if (!sqr)
   {
      // multiplication version

      // evaluate f1e(B^2) and B * f1o(B^2)
      zn_array_pack (v1e, op1, n1e, 2, 2 * b, 0, k1);
      zn_array_pack (v1o, op1 + 1, n1o, 2, 2 * b, b, k1);

      // evaluate f2e(B^2) and B * f2o(B^2)
      zn_array_pack (v2e, op2, n2e, 2, 2 * b, 0, k2);
      zn_array_pack (v2o, op2 + 1, n2o, 2, 2 * b, b, k2);

      // compute f1(B) = f1e(B^2) + B * f1o(B^2)
      //     and f2(B) = f2e(B^2) + B * f2o(B^2)
      ZNP_ASSERT_NOCARRY (mpn_add_n (v1p, v1e, v1o, k1));
      ZNP_ASSERT_NOCARRY (mpn_add_n (v2p, v2e, v2o, k2));

      // compute |f1(-B)| = |f1e(B^2) - B * f1o(B^2)|
      //     and |f2(-B)| = |f2e(B^2) - B * f2o(B^2)|
      v3m_neg  = signed_mpn_sub_n (v1m, v1e, v1o, k1);
      v3m_neg ^= signed_mpn_sub_n (v2m, v2e, v2o, k2);

      // compute  h(B)   =  f1(B)   *  f2(B)
      // compute |h(-B)| = |f1(-B)| * |f2(-B)|
      // v3m_neg is set if h(-B) is negative
      ZNP_mpn_mul (v3m, v1m, k1, v2m, k2);
      ZNP_mpn_mul (v3p, v1p, k1, v2p, k2);
   }
   else
   {
      // squaring version

      // evaluate f1e(B^2) and B * f1o(B^2)
      zn_array_pack (v1e, op1, n1e, 2, 2 * b, 0, k1);
      zn_array_pack (v1o, op1 + 1, n1o, 2, 2 * b, b, k1);

      // compute f1(B) = f1e(B^2) + B * f1o(B^2)
      ZNP_ASSERT_NOCARRY (mpn_add_n (v1p, v1e, v1o, k1));

      // compute |f1(-B)| = |f1e(B^2) - B * f1o(B^2)|
      signed_mpn_sub_n (v1m, v1e, v1o, k1);

      // compute h(B)  = f1(B)^2
      // compute h(-B) = f1(-B)^2
      // v3m_neg is cleared (since f1(-B)^2 is never negative)
      ZNP_mpn_mul (v3m, v1m, k1, v1m, k1);
      ZNP_mpn_mul (v3p, v1p, k1, v1p, k1);
      v3m_neg = 0;
   }
   
   // he(B^2) and B * ho(B^2) are both at most b * (n3 + 1) bits long (since
   // the coefficients don't overlap). The buffers used below are at least
   // b * (n1 + n2 + 2) = b * (n3 + 3) bits long. So we definitely have
   // enough room for 2 * he(B^2) and 2 * B * ho(B^2).

   // compute 2 * he(B^2) = h(B) + h(-B)
   ZNP_ASSERT_NOCARRY (v3m_neg ? mpn_sub_n (v3e, v3p, v3m, k3)
                               : mpn_add_n (v3e, v3p, v3m, k3));

   // unpack coefficients of he, and reduce mod m
   zn_array_unpack_SAFE (z, v3e, n3e, 2 * b, 1, k3);
   array_reduce (res, 2, z, n3e, w, redc, mod);
   
   // compute 2 * B * ho(B^2) = h(B) - h(-B)
   ZNP_ASSERT_NOCARRY (v3m_neg ? mpn_add_n (v3o, v3p, v3m, k3)
                               : mpn_sub_n (v3o, v3p, v3m, k3));
   
   // unpack coefficients of ho, and reduce mod m
   zn_array_unpack_SAFE (z, v3o, n3o, 2 * b, b + 1, k3);
   array_reduce (res + 1, 2, z, n3o, w, redc, mod);

   ZNP_FASTFREE (z);
   ZNP_FASTFREE (limbs);
}
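
The even/odd recovery used in KS2 and KS4 can likewise be isolated: h(B) + h(-B) gives twice the even part evaluated at B^2, and h(B) - h(-B) gives twice B times the odd part, so each evaluation needs only half the digit width. A self-contained GMP sketch (illustrative only, independent of zn_poly):

#include <gmp.h>

int
main (void)
{
   // h = 4 + 13x + 22x^2 + 15x^3; recover its even and odd coefficients
   // from the two values h(B) and h(-B), where B = 2^b.
   const unsigned b = 16;
   const unsigned long h[4] = {4, 13, 22, 15};

   mpz_t hp, hm, he, ho, t;
   mpz_inits (hp, hm, he, ho, t, NULL);

   // evaluate hp = h(B) and hm = h(-B) by Horner's rule
   int i;
   for (i = 3; i >= 0; i--)
   {
      mpz_mul_2exp (hp, hp, b);
      mpz_add_ui (hp, hp, h[i]);
      mpz_mul_2exp (hm, hm, b);
      if (i & 1)
         mpz_sub_ui (hm, hm, h[i]);
      else
         mpz_add_ui (hm, hm, h[i]);
   }

   mpz_add (he, hp, hm);              // 2 * he(B^2)
   mpz_sub (ho, hp, hm);              // 2 * B * ho(B^2)
   mpz_tdiv_q_2exp (he, he, 1);       // he(B^2)
   mpz_tdiv_q_2exp (ho, ho, b + 1);   // ho(B^2)

   // base-B^2 digits of he are the even coefficients, of ho the odd ones
   for (i = 0; i < 2; i++)
   {
      mpz_tdiv_r_2exp (t, he, 2 * b);
      mpz_tdiv_q_2exp (he, he, 2 * b);
      gmp_printf ("even[%d] = %Zd\n", i, t);   // prints 4, then 22
      mpz_tdiv_r_2exp (t, ho, 2 * b);
      mpz_tdiv_q_2exp (ho, ho, 2 * b);
      gmp_printf ("odd[%d]  = %Zd\n", i, t);   // prints 13, then 15
   }

   mpz_clears (hp, hm, he, ho, t, NULL);
   return 0;
}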
Example #19
void
virtual_pmfvec_ifft (virtual_pmfvec_t vec, ulong n, int fwd, ulong t)
{
   ZNP_ASSERT (vec->lgK <= vec->lgM + 1);
   ZNP_ASSERT (t * vec->K < 2 * vec->M);
   ZNP_ASSERT (n + fwd <= vec->K);

   if (vec->lgK == 0)
      return;
      
   vec->lgK--;
   vec->K >>= 1;
   
   const zn_mod_struct* mod = vec->mod;
   virtual_pmf_t* data = vec->data;
   ulong M = vec->M;
   ulong K = vec->K;
   ulong s, r = M >> vec->lgK;
   long i;

   if (n + fwd <= K)
   {
      for (i = K - 1; i >= (long) n; i--)
      {
         virtual_pmf_add (data[i], data[i + K]);
         virtual_pmf_divby2 (data[i]);
      }

      virtual_pmfvec_ifft (vec, n, fwd, t << 1);
      
      for (; i >= 0; i--)
      {
         virtual_pmf_add (data[i], data[i]);
         virtual_pmf_sub (data[i], data[i + K]);
      }
   }
   else
   {
      virtual_pmfvec_ifft (vec, K, 0, t << 1);

      for (i = K - 1, s = t + r * i; i >= (long)(n - K); i--, s -= r)
      {
         virtual_pmf_sub (data[i + K], data[i]);
         virtual_pmf_sub (data[i], data[i + K]);
         virtual_pmf_rotate (data[i + K], M + s);
      }

      vec->data += K;
      virtual_pmfvec_ifft (vec, n - K, fwd, t << 1);
      vec->data -= K;
      
      for (; i >= 0; i--, s -= r)
      {
         virtual_pmf_rotate (data[i + K], M - s);
         virtual_pmf_bfly (data[i + K], data[i]);
      }
   }

   vec->K <<= 1;
   vec->lgK++;
}
Example #20
void
zn_array_mul_fft_dft (ulong* res,
                      const ulong* op1, size_t n1,
                      const ulong* op2, size_t n2,
                      unsigned lgT, const zn_mod_t mod)
{
   ZNP_ASSERT (mod->m & 1);
   ZNP_ASSERT (n2 >= 1);
   ZNP_ASSERT (n1 >= n2);
   
   if (lgT == 0)
   {
      // no layers of DFT; just call usual FFT routine
      int sqr = (op1 == op2) && (n1 == n2);
      ulong x = zn_array_mul_fft_fudge (n1, n2, sqr, mod);
      zn_array_mul_fft (res, op1, n1, op2, n2, x, mod);
      return;
   }

   unsigned lgM, lgK;

   // number of pmf_t coefficients for each input poly
   ulong m1, m2;

   // figure out how big the transform needs to be
   mul_fft_params (&lgK, &lgM, &m1, &m2, n1, n2);

   // number of pmf_t coefficients for output poly
   ulong m = m1 + m2 - 1;

   ulong M = 1UL << lgM;
   ulong K = 1UL << lgK;
   ptrdiff_t skip = M + 1;

   size_t n3 = n1 + n2 - 1;

   // Split up transform into length K = U * T, i.e. U columns and T rows.
   if (lgT >= lgK)
      lgT = lgK;
   unsigned lgU = lgK - lgT;
   ulong U = 1UL << lgU;
   ulong T = 1UL << lgT;
   
   // space for two input rows, and one partial row
   pmfvec_t in1, in2, part;
   pmfvec_init (in1, lgU, skip, lgM, mod);
   pmfvec_init (in2, lgU, skip, lgM, mod);
   pmfvec_init (part, lgU, skip, lgM, mod);

   // the virtual pmfvec_t that we use for the column DFTs
   virtual_pmfvec_t col;
   virtual_pmfvec_init (col, lgT, lgM, mod);

   // zero the output
   zn_array_zero (res, n3);
   
   long i, j, k;
   int which;

   // Write m = U * mT + mU, where 0 <= mU < U
   ulong mU = m & (U - 1);
   ulong mT = m >> lgU;

   // for each row (beginning with the last partial row if it exists)....
   for (i = mT - (mU == 0); i >= 0; i--)
   {
      ulong i_rev = bit_reverse (i, lgT);
      
      // for each input array....
      for (which = 0; which < 2; which++)
      {
         pmfvec_struct* in = which ? in2 : in1;
         const ulong* op = which ? op2 : op1;
         size_t n = which ? n2 : n1;

         pmf_t p = in->data;

         for (j = 0; j < U; j++, p += in->skip)
         {
            // compute the i-th row of the j-th column as it would look after
            // the column FFTs, using naive DFT
            pmf_zero (p, M);
            ulong r = i_rev << (lgM - lgT + 1);
            
            for (k = 0; k < T; k++)
            {
               merge_chunk_to_pmf (p, op, n, (k * U + j) << (lgM - 1), M, mod);
               pmf_rotate (p, -r);
            }
            
            pmf_rotate (p, (i_rev * j) << (lgM - lgK + 1));
         }
         
         // Now we've got the whole row; run FFT on the row
         pmfvec_fft (in, (i == mT) ? mU : U, U, 0);
      }

      if (i == mT)
      {
         // pointwise multiply the two partial rows
         pmfvec_mul (part, in1, in2, mU, i == 0);
         // remove fudge factor
         pmfvec_scalar_mul (part, mU, pmfvec_mul_fudge (lgM, 0, mod));

         // zero remainder of the partial row; we will subsequently add
         // in contributions from the vertical IFFTs when we process the other
         // rows.
         for (j = mU; j < U; j++)
            pmf_zero (part->data + part->skip * j, M);
      }
      else
      {
         // pointwise multiply the two rows
         pmfvec_mul (in1, in1, in2, U, i == 0);
         // remove fudge factor
         pmfvec_scalar_mul (in1, U, pmfvec_mul_fudge (lgM, 0, mod));
         
         // horizontal IFFT this row
         pmfvec_ifft (in1, U, 0, U, 0);

         // simulate vertical IFFTs with DFTs
         for (j = 0; j < U; j++)
         {
            virtual_pmfvec_reset (col);
            virtual_pmf_import (col->data[i], in1->data + in1->skip * j);
            virtual_pmfvec_ifft (col, mT + (j < mU), (j >= mU) && mU,
                                 j << (lgM + 1 - lgK));
            
            if ((j >= mU) && mU)
            {
               // add contribution to partial row (only for rightmost columns)
               pmf_t src = virtual_pmf_export (col->data[mT]);
               if (src)
                  pmf_add (part->data + part->skip * j, src, M, mod);
            }

            // add contributions to output
            for (k = 0; k < mT + (j < mU); k++)
               merge_chunk_from_pmf (res, n3,
                                     virtual_pmf_export (col->data[k]),
                                     (k * U + j) * M/2, M, mod);
         }
      }
   }

   // now finish off the partial row
   if (mU)
   {
      // horizontal IFFT partial row
      pmfvec_ifft (part, mU, 0, U, 0);

      // simulate leftmost vertical IFFTs
      for (j = 0; j < mU; j++)
      {
         virtual_pmfvec_reset (col);
         virtual_pmf_import (col->data[mT], part->data + part->skip * j);
         virtual_pmfvec_ifft (col, mT + 1, 0, j << (lgM + 1 - lgK));
                         
         // add contributions to output
         for (k = 0; k <= mT; k++)
            merge_chunk_from_pmf (res, n3,
                                  virtual_pmf_export (col->data[k]),
                                  (k * U + j) * M/2, M, mod);
      }
   }
   
   // normalise result
   zn_array_scalar_mul (res, res, n3, zn_mod_pow2 (-lgK, mod), mod);

   virtual_pmfvec_clear (col);
   pmfvec_clear (part);
   pmfvec_clear (in2);
   pmfvec_clear (in1);
}
Example #21
void zn_array_mul_fft (ulong* res,
                       const ulong* op1, size_t n1,
                       const ulong* op2, size_t n2,
                       ulong x, const zn_mod_t mod)
{
   ZNP_ASSERT (mod->m & 1);
   ZNP_ASSERT (n2 >= 1);
   ZNP_ASSERT (n1 >= n2);

   unsigned lgK, lgM;
   
   // number of pmf_t coefficients for each input poly
   ulong m1, m2;

   // figure out how big the transform needs to be
   mul_fft_params (&lgK, &lgM, &m1, &m2, n1, n2);
   
   // number of pmf_t coefficients for output poly
   ulong m3 = m1 + m2 - 1;

   ulong M = 1UL << lgM;
   ulong K = 1UL << lgK;
   ptrdiff_t skip = M + 1;
   
   pmfvec_t vec1, vec2;
   
   int sqr = (op1 == op2  &&  n1 == n2);

   if (!sqr)
   {
      // multiplying two distinct inputs

      // split inputs into pmf_t's and perform FFTs
      pmfvec_init (vec1, lgK, skip, lgM, mod);
      fft_split (vec1, op1, n1, 0, 1, 0);
      pmfvec_fft (vec1, m3, m1, 0);

      // note: we apply the fudge factor here, because the second input is
      // shorter than both the first input and the output :-)
      pmfvec_init (vec2, lgK, skip, lgM, mod);
      fft_split (vec2, op2, n2, 0, x, 0);
      pmfvec_fft (vec2, m3, m2, 0);

      // pointwise multiplication
      pmfvec_mul (vec1, vec1, vec2, m3, 1);

      pmfvec_clear (vec2);
   }
   else
   {
      // squaring a single input
   
      // split input into pmf_t's and perform FFTs
      pmfvec_init (vec1, lgK, skip, lgM, mod);
      fft_split (vec1, op1, n1, 0, 1, 0);
      pmfvec_fft (vec1, m3, m1, 0);

      // pointwise multiplication
      pmfvec_mul (vec1, vec1, vec1, m3, 1);
   }

   // inverse FFT, and write output
   pmfvec_ifft (vec1, m3, 0, m3, 0);
   size_t n3 = n1 + n2 - 1;
   fft_combine (res, n3, vec1, m3, 0);

   pmfvec_clear (vec1);
   
   // if we're squaring, then we haven't applied the fudge factor yet,
   // so do it now
   if (sqr)
      zn_array_scalar_mul_or_copy (res, res, n3, x, mod);
}