Example #1
void
ecc_modp_sqr (const struct ecc_curve *ecc, mp_limb_t *rp,
	      const mp_limb_t *ap)
{
  mpn_sqr (rp, ap, ecc->size);
  ecc->reduce (ecc, rp);
}
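The wrapper above (apparently from GNU Nettle's ECC code) pairs a full square with a curve-specific modular reduction. For reference, a minimal standalone sketch of the underlying primitive, assuming GMP >= 5 and linking with -lgmp: mpn_sqr squares an n-limb operand into a non-overlapping 2n-limb result.

#include <gmp.h>

int main(void)
{
  mp_limb_t a[2] = { 3, 1 };   /* the value 1*B + 3, where B = 2^GMP_NUMB_BITS */
  mp_limb_t r[4];              /* the square always occupies 2*n limbs */

  mpn_sqr(r, a, 2);            /* r must not overlap a */

  gmp_printf("%Nd\n", r, (mp_size_t) 4);
  return 0;
}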
Example #2
/* Apparently a configure-style probe: it only needs to compile and
   link.  gmp.h defines mpn_sqr as a macro in GMP >= 5, so the #ifdef
   selects it when available and falls back to mpn_mul_n otherwise;
   the dummy arguments are never meant to be executed. */
#include <stddef.h>
#include <gmp.h>

int main(void)
{
  mp_limb_t *x = NULL, *y = NULL;
  long nx = 0;
#ifdef mpn_sqr
  mpn_sqr(y, x, nx);
#else
  mpn_mul_n(y, x, x, nx);
#endif
  return 0;
}
Example #3
void Sqr(modp& ans,const modp& x,const Zp_Data& ZpD)
{ 
  if (ZpD.montgomery)
    { ZpD.Mont_Mult(ans.x,x.x,x.x); }
  else
    { //ans.x=(x.x*x.x)%ZpD.pr;
      mp_limb_t aa[2*MAX_MOD_SZ],q[2*MAX_MOD_SZ];
      mpn_sqr(aa,x.x,ZpD.t);
      mpn_tdiv_qr(q,ans.x,0,aa,2*ZpD.t,ZpD.prA,ZpD.t);     
    }
}
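Without the surrounding SPDZ-style classes (modp, Zp_Data) the non-Montgomery branch is hard to run; here is a self-contained sketch of the same square-then-divide pattern, with a hypothetical one-limb modulus standing in for ZpD.prA and ZpD.t:

#include <stdio.h>
#include <gmp.h>

int main(void)
{
  mp_limb_t x[1]  = { 5 };
  mp_limb_t p[1]  = { 97 };    /* stand-in for ZpD.prA, with ZpD.t == 1 */
  mp_limb_t aa[2], q[2], r[1];

  mpn_sqr(aa, x, 1);                  /* aa = x^2, 2*t limbs */
  mpn_tdiv_qr(q, r, 0, aa, 2, p, 1);  /* r = aa mod p */

  printf("%lu\n", (unsigned long) r[0]);  /* 25 */
  return 0;
}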
Example #4
/* Input is {ap,rn}; output is {rp,rn}, computation is
   mod B^rn - 1, and values are semi-normalised; zero is represented
   as either 0 or B^rn - 1.  Needs a scratch of 2rn limbs at tp.
   tp==rp is allowed. */
static void
mpn_bc_sqrmod_bnm1 (mp_ptr rp, mp_srcptr ap, mp_size_t rn, mp_ptr tp)
{
  mp_limb_t cy;

  ASSERT (0 < rn);

  mpn_sqr (tp, ap, rn);
  cy = mpn_add_n (rp, tp, tp + rn, rn);
  /* If cy == 1, then the value of rp is at most B^rn - 2, so there can
   * be no overflow when adding in the carry. */
  MPN_INCR_U (rp, rn, cy);
}
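The fold works because B^rn == 1 (mod B^rn - 1), so the high half of the square simply adds onto the low half. A one-limb sketch of the identity, using plain addition in place of the internal MPN_INCR_U (safe here for the reason given in the comment above); assumes GMP and -lgmp:

#include <gmp.h>

int main(void)
{
  mp_limb_t a[1] = { (mp_limb_t) -5 };  /* an arbitrary large limb */
  mp_limb_t t[2], r[1], cy;

  mpn_sqr(t, a, 1);                /* t[1]*B + t[0] = a^2 */
  cy = mpn_add_n(r, t, t + 1, 1);  /* fold the high limb onto the low one */
  r[0] += cy;                      /* wrap the carry, since B == 1 (mod B - 1) */

  gmp_printf("%Mu\n", r[0]);       /* a^2 mod (B - 1), semi-normalised */
  return 0;
}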
Example #5
/* Input is {ap,rn+1}; output is {rp,rn+1}, in
   semi-normalised representation, computation is mod B^rn + 1. Needs
   a scratch area of 2rn + 2 limbs at tp; tp == rp is allowed.
   Output is normalised. */
static void
mpn_bc_sqrmod_bnp1 (mp_ptr rp, mp_srcptr ap, mp_size_t rn, mp_ptr tp)
{
  mp_limb_t cy;

  ASSERT (0 < rn);

  mpn_sqr (tp, ap, rn + 1);
  ASSERT (tp[2*rn+1] == 0);
  ASSERT (tp[2*rn] < GMP_NUMB_MAX);
  cy = tp[2*rn] + mpn_sub_n (rp, tp, tp+rn, rn);
  rp[rn] = 0;
  MPN_INCR_U (rp, rn+1, cy );
}
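The B^rn + 1 case subtracts instead, since B^rn == -1 (mod B^rn + 1). A scaled-down sketch of that wraparound on a two-limb value, public mpn calls only:

#include <gmp.h>

int main(void)
{
  mp_limb_t t[2] = { 7, 9 };       /* the value 9*B + 7 */
  mp_limb_t r[2], cy;

  cy = mpn_sub_n(r, t, t + 1, 1);  /* 7 - 9 underflows, cy = 1 */
  r[1] = 0;
  mpn_add_1(r, r, 2, cy);          /* add the borrow back: -B == 1 (mod B + 1) */

  gmp_printf("%Nd\n", r, (mp_size_t) 2);  /* (9*B + 7) mod (B + 1) = B - 1 */
  return 0;
}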
Example #6
void _fmpz_poly_sqrlow_KS(fmpz * res, const fmpz * poly, long len, long n)
{
    int neg;
    long bits, limbs, loglen, sign = 0;
    mp_limb_t *arr_in, *arr_out;

    FMPZ_VEC_NORM(poly, len);

    if (len == 0)
    {
        _fmpz_vec_zero(res, n);
        return;
    }

    neg = (fmpz_sgn(poly + len - 1) > 0) ? 0 : -1;

    if (n > 2 * len - 1)
    {
        _fmpz_vec_zero(res + 2 * len - 1, n - (2 * len - 1));
        n = 2 * len - 1;
    }

    bits = _fmpz_vec_max_bits(poly, len);
    if (bits < 0)
    {
        sign = 1;
        bits = - bits;
    }

    loglen = FLINT_BIT_COUNT(len);
    bits   = 2 * bits + loglen + sign;
    limbs  = (bits * len - 1) / FLINT_BITS + 1;

    arr_in  = flint_calloc(limbs, sizeof(mp_limb_t));
    arr_out = flint_malloc((2 * limbs) * sizeof(mp_limb_t));

    _fmpz_poly_bit_pack(arr_in, poly, len, bits, neg);

    mpn_sqr(arr_out, arr_in, limbs);

    if (sign)
        _fmpz_poly_bit_unpack(res, n, arr_out, bits, 0);
    else
        _fmpz_poly_bit_unpack_unsigned(res, n, arr_out, bits);

    flint_free(arr_in);
    flint_free(arr_out);
}
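_fmpz_poly_sqrlow_KS is FLINT's Kronecker-substitution squaring: the coefficients are packed into bit fields of one large integer, a single mpn_sqr does all the coefficient arithmetic at once, and the result is unpacked again. A toy version of the same trick through the public mpz layer, with the field width b chosen large enough that no coefficient of the square overflows its field:

#include <stdio.h>
#include <gmp.h>

int main(void)
{
  /* square p(x) = 3x + 2 by evaluating at x = 2^b and squaring once */
  const unsigned b = 16;
  mpz_t packed, sq, tmp;
  mpz_inits(packed, sq, tmp, NULL);

  mpz_set_ui(packed, 3);
  mpz_mul_2exp(packed, packed, b);
  mpz_add_ui(packed, packed, 2);   /* packed = p(2^b) */

  mpz_mul(sq, packed, packed);     /* one big squaring */

  /* unpack p(x)^2 = 9x^2 + 12x + 4 from the bit fields */
  for (int i = 0; i <= 2; i++)
    {
      mpz_tdiv_q_2exp(tmp, sq, (mp_bitcnt_t) (i * b));
      printf("coeff %d = %lu\n", i, mpz_get_ui(tmp) & ((1UL << b) - 1));
    }

  mpz_clears(packed, sq, tmp, NULL);
  return 0;
}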
Example #7
/* Computes {rp,MIN(rn,2an)} <- {ap,an}^2 Mod(B^rn-1)
 *
 * The result is expected to be ZERO if and only if the operand
 * already is. Otherwise the class [0] Mod(B^rn-1) is represented by
 * B^rn-1.
 * It should not be a problem if sqrmod_bnm1 is used to
 * compute the full square with 2*an <= rn, because this condition
 * implies (B^an-1)^2 < (B^rn-1).
 *
 * Requires rn/4 < an <= rn
 * Scratch need: rn/2 + (need for recursive call OR rn + 3). This gives
 *
 * S(n) <= rn/2 + MAX (rn + 4, S(n/2)) <= 3/2 rn + 4
 */
void
mpn_sqrmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_ptr tp)
{
  ASSERT (0 < an);
  ASSERT (an <= rn);

  if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, SQRMOD_BNM1_THRESHOLD))
    {
      if (UNLIKELY (an < rn))
	{
	  if (UNLIKELY (2*an <= rn))
	    {
	      mpn_sqr (rp, ap, an);
	    }
	  else
	    {
	      mp_limb_t cy;
	      mpn_sqr (tp, ap, an);
	      cy = mpn_add (rp, tp, rn, tp + rn, 2*an - rn);
	      MPN_INCR_U (rp, rn, cy);
	    }
	}
      else
	mpn_bc_sqrmod_bnm1 (rp, ap, rn, tp);
    }
  else
    {
      mp_size_t n;
      mp_limb_t cy;
      mp_limb_t hi;

      n = rn >> 1;

      ASSERT (2*an > n);

      /* Compute xm = a^2 mod (B^n - 1), xp = a^2 mod (B^n + 1)
	 and crt together as

	 x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)]
      */

#define a0 ap
#define a1 (ap + n)

#define xp  tp	/* 2n + 2 */
      /* am1  maybe in {xp, n} */
#define sp1 (tp + 2*n + 2)
      /* ap1  maybe in {sp1, n + 1} */

      {
	mp_srcptr am1;
	mp_size_t anm;
	mp_ptr so;

	if (LIKELY (an > n))
	  {
	    so = xp + n;
	    am1 = xp;
	    cy = mpn_add (xp, a0, n, a1, an - n);
	    MPN_INCR_U (xp, n, cy);
	    anm = n;
	  }
	else
	  {
	    so = xp;
	    am1 = a0;
	    anm = an;
	  }

	mpn_sqrmod_bnm1 (rp, n, am1, anm, so);
      }

      {
	int       k;
	mp_srcptr ap1;
	mp_size_t anp;

	if (LIKELY (an > n)) {
	  ap1 = sp1;
	  cy = mpn_sub (sp1, a0, n, a1, an - n);
	  sp1[n] = 0;
	  MPN_INCR_U (sp1, n + 1, cy);
	  anp = n + ap1[n];
	} else {
	  ap1 = a0;
	  anp = an;
	}

	if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD))
	  k=0;
	else
	  {
	    int mask;
	    k = mpn_fft_best_k (n, 1);
	    mask = (1<<k) -1;
	    while (n & mask) {k--; mask >>=1;};
	  }
	if (k >= FFT_FIRST_K)
	  xp[n] = mpn_mul_fft (xp, n, ap1, anp, ap1, anp, k);
	else if (UNLIKELY (ap1 == a0))
	  {
	    ASSERT (anp <= n);
	    ASSERT (2*anp > n);
	    mpn_sqr (xp, a0, an);
	    anp = 2*an - n;
	    cy = mpn_sub (xp, xp, n, xp + n, anp);
	    xp[n] = 0;
	    MPN_INCR_U (xp, n+1, cy);
	  }
	else
	  mpn_bc_sqrmod_bnp1 (xp, ap1, n, xp);
      }

      /* Here the CRT recomposition begins.

	 xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1)
	 Division by 2 is a bitwise rotation.

	 Assumes xp normalised mod (B^n+1).

	 The residue class [0] is represented by [B^n-1]; except when
	 both input are ZERO.
      */

#if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc
#if HAVE_NATIVE_mpn_rsh1add_nc
      cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */
      hi = cy << (GMP_NUMB_BITS - 1);
      cy = 0;
      /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi
	 overflows, i.e. a further increment will not overflow again. */
#else /* ! _nc */
      cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */
      hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
      cy >>= 1;
      /* cy = 1 only if xp[n] = 1 i.e. {xp,n} = ZERO, this implies that
	 the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */
#endif
#if GMP_NAIL_BITS == 0
      add_ssaaaa(cy, rp[n-1], cy, rp[n-1], CNST_LIMB(0), hi);
#else
      cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS-1);
      rp[n-1] ^= hi;
#endif
#else /* ! HAVE_NATIVE_mpn_rsh1add_n */
#if HAVE_NATIVE_mpn_add_nc
      cy = mpn_add_nc(rp, rp, xp, n, xp[n]);
#else /* ! _nc */
      cy = xp[n] + mpn_add_n(rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */
#endif
      cy += (rp[0]&1);
      mpn_rshift(rp, rp, n, 1);
      ASSERT (cy <= 2);
      hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
      cy >>= 1;
      /* We can have cy != 0 only if hi = 0... */
      ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0);
      rp[n-1] |= hi;
      /* ... rp[n-1] + cy can not overflow, the following INCR is correct. */
#endif
      ASSERT (cy <= 1);
      /* Next increment can not overflow, read the previous comments about cy. */
      ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0));
      MPN_INCR_U(rp, n, cy);

      /* Compute the highest half:
	 ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n
       */
      if (UNLIKELY (2*an < rn))
	{
	  /* Note that in this case, the only way the result can equal
	     zero mod B^{rn} - 1 is if the input is zero, and
	     then the output of both the recursive calls and this CRT
	     reconstruction is zero, not B^{rn} - 1. */
	  cy = mpn_sub_n (rp + n, rp, xp, 2*an - n);

	  /* FIXME: This subtraction of the high parts is not really
	     necessary, we do it to get the carry out, and for sanity
	     checking. */
	  cy = xp[n] + mpn_sub_nc (xp + 2*an - n, rp + 2*an - n,
				   xp + 2*an - n, rn - 2*an, cy);
	  ASSERT (mpn_zero_p (xp + 2*an - n+1, rn - 1 - 2*an));
	  cy = mpn_sub_1 (rp, rp, 2*an, cy);
	  ASSERT (cy == (xp + 2*an - n)[0]);
	}
      else
	{
	  cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
	  /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
	     DECR will affect _at most_ the lowest n limbs. */
	  MPN_DECR_U (rp, 2*n, cy);
	}
#undef a0
#undef a1
#undef xp
#undef sp1
    }
}
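The recomposition formula in the comment, x = -xp * B^n + (B^n + 1) * [(xp + xm)/2 mod (B^n - 1)], is easy to verify on small numbers. A sketch with B^n scaled down to 256, so the two coprime moduli are m1 = 255 and m2 = 257:

#include <stdio.h>

int main(void)
{
  const long bn = 256, m1 = bn - 1, m2 = bn + 1;
  long x = (12345L * 12345L) % (m1 * m2);  /* a square mod B^2n - 1 */
  long xm = x % m1;                        /* residue mod B^n - 1 */
  long xp = x % m2;                        /* residue mod B^n + 1 */

  /* halve mod m1; when the sum is odd, adding m1 first makes it even */
  long half = (xp + xm) % m1;
  half = (half % 2 == 0) ? half / 2 : (half + m1) / 2;

  long rec = ((-xp * bn + m2 * half) % (m1 * m2) + m1 * m2) % (m1 * m2);
  printf("%ld == %ld\n", x, rec);          /* reconstruction matches */
  return 0;
}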
Example #8
void
mpz_mul (mpz_ptr w, mpz_srcptr u, mpz_srcptr v)
{
  mp_size_t usize;
  mp_size_t vsize;
  mp_size_t wsize;
  mp_size_t sign_product;
  mp_ptr up, vp;
  mp_ptr wp;
  mp_ptr free_me;
  size_t free_me_size;
  mp_limb_t cy_limb;
  TMP_DECL;

  usize = SIZ (u);
  vsize = SIZ (v);
  sign_product = usize ^ vsize;
  usize = ABS (usize);
  vsize = ABS (vsize);

  if (usize < vsize)
    {
      MPZ_SRCPTR_SWAP (u, v);
      MP_SIZE_T_SWAP (usize, vsize);
    }

  if (vsize == 0)
    {
      SIZ (w) = 0;
      return;
    }

#if HAVE_NATIVE_mpn_mul_2
  if (vsize <= 2)
    {
      wp = MPZ_REALLOC (w, usize+vsize);
      if (vsize == 1)
	cy_limb = mpn_mul_1 (wp, PTR (u), usize, PTR (v)[0]);
      else
	{
	  cy_limb = mpn_mul_2 (wp, PTR (u), usize, PTR (v));
	  usize++;
	}
      wp[usize] = cy_limb;
      usize += (cy_limb != 0);
      SIZ (w) = (sign_product >= 0 ? usize : -usize);
      return;
    }
#else
  if (vsize == 1)
    {
      wp = MPZ_REALLOC (w, usize+1);
      cy_limb = mpn_mul_1 (wp, PTR (u), usize, PTR (v)[0]);
      wp[usize] = cy_limb;
      usize += (cy_limb != 0);
      SIZ (w) = (sign_product >= 0 ? usize : -usize);
      return;
    }
#endif

  TMP_MARK;
  free_me = NULL;
  up = PTR (u);
  vp = PTR (v);
  wp = PTR (w);

  /* Ensure W has space enough to store the result.  */
  wsize = usize + vsize;
  if (ALLOC (w) < wsize)
    {
      if (wp == up || wp == vp)
	{
	  free_me = wp;
	  free_me_size = ALLOC (w);
	}
      else
	(*__gmp_free_func) (wp, (size_t) ALLOC (w) * GMP_LIMB_BYTES);

      ALLOC (w) = wsize;
      wp = __GMP_ALLOCATE_FUNC_LIMBS (wsize);
      PTR (w) = wp;
    }
  else
    {
      /* Make U and V not overlap with W.  */
      if (wp == up)
	{
	  /* W and U are identical.  Allocate temporary space for U.  */
	  up = TMP_ALLOC_LIMBS (usize);
	  /* Is V identical too?  Keep it identical with U.  */
	  if (wp == vp)
	    vp = up;
	  /* Copy to the temporary space.  */
	  MPN_COPY (up, wp, usize);
	}
      else if (wp == vp)
	{
	  /* W and V are identical.  Allocate temporary space for V.  */
	  vp = TMP_ALLOC_LIMBS (vsize);
	  /* Copy to the temporary space.  */
	  MPN_COPY (vp, wp, vsize);
	}
    }

  if (up == vp)
    {
      mpn_sqr (wp, up, usize);
      cy_limb = wp[wsize - 1];
    }
  else
    {
      cy_limb = mpn_mul (wp, up, usize, vp, vsize);
    }

  wsize -= cy_limb == 0;

  SIZ (w) = sign_product < 0 ? -wsize : wsize;
  if (free_me != NULL)
    (*__gmp_free_func) (free_me, free_me_size * GMP_LIMB_BYTES);
  TMP_FREE;
}
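The up == vp comparison near the end is what routes aliased operands to mpn_sqr. Seen from application code, a short usage sketch (GMP, linked with -lgmp):

#include <gmp.h>

int main(void)
{
  mpz_t u, w;
  mpz_init_set_str(u, "123456789123456789", 10);
  mpz_init(w);

  mpz_mul(w, u, u);        /* same operand twice: takes the squaring path */
  gmp_printf("%Zd\n", w);

  mpz_clears(u, w, NULL);
  return 0;
}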
Example #9
void
mpz_powm_ui (mpz_ptr r, mpz_srcptr b, unsigned long int el, mpz_srcptr m)
{
  mp_ptr xp, tp, qp, mp, bp;
  mp_size_t xn, tn, mn, bn;
  int m_zero_cnt;
  int c;
  mp_limb_t e;
  TMP_DECL;

  mp = PTR(m);
  mn = ABSIZ(m);
  if (mn == 0)
    DIVIDE_BY_ZERO;

  if (el == 0)
    {
      /* Exponent is zero, result is 1 mod MOD, i.e., 1 or 0
	 depending on if MOD equals 1.  */
      SIZ(r) = (mn == 1 && mp[0] == 1) ? 0 : 1;
      PTR(r)[0] = 1;
      return;
    }

  TMP_MARK;

  /* Normalize m (i.e. make its most significant bit set) as required by
     division functions below.  */
  count_leading_zeros (m_zero_cnt, mp[mn - 1]);
  m_zero_cnt -= GMP_NAIL_BITS;
  if (m_zero_cnt != 0)
    {
      mp_ptr new_mp = TMP_ALLOC_LIMBS (mn);
      mpn_lshift (new_mp, mp, mn, m_zero_cnt);
      mp = new_mp;
    }

  bn = ABSIZ(b);
  bp = PTR(b);
  if (bn > mn)
    {
      /* Reduce possibly huge base.  Use a function call to reduce, since we
	 don't want the quotient allocation to live until function return.  */
      mp_ptr new_bp = TMP_ALLOC_LIMBS (mn);
      reduce (new_bp, bp, bn, mp, mn);
      bp = new_bp;
      bn = mn;
      /* Canonicalize the base, since we are potentially going to multiply with
	 it quite a few times.  */
      MPN_NORMALIZE (bp, bn);
    }

  if (bn == 0)
    {
      SIZ(r) = 0;
      TMP_FREE;
      return;
    }

  tp = TMP_ALLOC_LIMBS (2 * mn + 1);
  xp = TMP_ALLOC_LIMBS (mn);

  qp = TMP_ALLOC_LIMBS (mn + 1);

  MPN_COPY (xp, bp, bn);
  xn = bn;

  e = el;
  count_leading_zeros (c, e);
  e = (e << c) << 1;		/* shift the exp bits to the left, lose msb */
  c = BITS_PER_MP_LIMB - 1 - c;

  /* Main loop. */

  /* If m is already normalized (high bit of high limb set), and b is the
     same size, but a bigger value, and e==1, then there's no modular
     reductions done and we can end up with a result out of range at the
     end. */
  if (c == 0)
    {
      if (xn == mn && mpn_cmp (xp, mp, mn) >= 0)
        mpn_sub_n (xp, xp, mp, mn);
      goto finishup;
    }

  while (c != 0)
    {
      mpn_sqr (tp, xp, xn);
      tn = 2 * xn; tn -= tp[tn - 1] == 0;
      if (tn < mn)
	{
	  MPN_COPY (xp, tp, tn);
	  xn = tn;
	}
      else
	{
	  mpn_tdiv_qr (qp, xp, 0L, tp, tn, mp, mn);
	  xn = mn;
	}

      if ((mp_limb_signed_t) e < 0)
	{
	  mpn_mul (tp, xp, xn, bp, bn);
	  tn = xn + bn; tn -= tp[tn - 1] == 0;
	  if (tn < mn)
	    {
	      MPN_COPY (xp, tp, tn);
	      xn = tn;
	    }
	  else
	    {
	      mpn_tdiv_qr (qp, xp, 0L, tp, tn, mp, mn);
	      xn = mn;
	    }
	}
      e <<= 1;
      c--;
    }

 finishup:
  /* We shifted m left m_zero_cnt steps.  Adjust the result by reducing
     it with the original MOD.  */
  if (m_zero_cnt != 0)
    {
      mp_limb_t cy;
      cy = mpn_lshift (tp, xp, xn, m_zero_cnt);
      tp[xn] = cy; xn += cy != 0;

      if (xn < mn)
	{
	  MPN_COPY (xp, tp, xn);
	}
      else
	{
	  mpn_tdiv_qr (qp, xp, 0L, tp, xn, mp, mn);
	  xn = mn;
	}
      mpn_rshift (xp, xp, xn, m_zero_cnt);
    }
  MPN_NORMALIZE (xp, xn);

  if ((el & 1) != 0 && SIZ(b) < 0 && xn != 0)
    {
      mp = PTR(m);			/* want original, unnormalized m */
      mpn_sub (xp, mp, mn, xp, xn);
      xn = mn;
      MPN_NORMALIZE (xp, xn);
    }
  MPZ_REALLOC (r, xn);
  SIZ (r) = xn;
  MPN_COPY (PTR(r), xp, xn);

  TMP_FREE;
}
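mpz_powm_ui is public GMP, so the square-and-multiply loop above can be exercised directly; a minimal caller:

#include <gmp.h>

int main(void)
{
  mpz_t r, b, m;
  mpz_init(r);
  mpz_init_set_ui(b, 7);
  mpz_init_set_ui(m, 1000000007UL);

  mpz_powm_ui(r, b, 100UL, m);   /* 7^100 mod 1000000007 */
  gmp_printf("%Zd\n", r);

  mpz_clears(r, b, m, NULL);
  return 0;
}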
Example #10
void _arb_sin_cos_taylor_rs(mp_ptr ysin, mp_ptr ycos,
                            mp_limb_t * error, mp_srcptr x, mp_size_t xn, ulong N,
                            int sinonly, int alternating)
{
    mp_ptr s, t, xpow;
    mp_limb_t new_denom, old_denom, c;
    slong power, k, m;
    int cosorsin;

    TMP_INIT;
    TMP_START;

    if (2 * N >= FACTORIAL_TAB_SIZE - 1)
    {
        flint_printf("_arb_sin_cos_taylor_rs: N too large!\n");
        abort();
    }

    if (N <= 1)
    {
        if (N == 0)
        {
            flint_mpn_zero(ysin, xn);
            if (!sinonly) flint_mpn_zero(ycos, xn);
            error[0] = 0;
        }
        else if (N == 1)
        {
            flint_mpn_copyi(ysin, x, xn);
            if (!sinonly) flint_mpn_store(ycos, xn, LIMB_ONES);
            error[0] = 1;
        }
    }
    else
    {
        /* Choose m ~= sqrt(num_terms) (m must be even, >= 2) */
        m = 2;
        while (m * m < N)
            m += 2;

        /* todo: merge allocations */
        xpow = TMP_ALLOC_LIMBS((m + 1) * xn);
        s = TMP_ALLOC_LIMBS(xn + 2);
        t = TMP_ALLOC_LIMBS(2 * xn + 2);     /* todo: 1 limb too much? */

        /* higher index ---> */
        /*        | ---xn--- | */
        /* xpow = |  <temp>  | x^m | x^(m-1) | ... | x^2 | x | */

#define XPOW_WRITE(__k) (xpow + (m - (__k)) * xn)
#define XPOW_READ(__k) (xpow + (m - (__k) + 1) * xn)

        mpn_sqr(XPOW_WRITE(1), x, xn);
        mpn_sqr(XPOW_WRITE(2), XPOW_READ(1), xn);

        for (k = 4; k <= m; k += 2)
        {
            mpn_mul_n(XPOW_WRITE(k - 1), XPOW_READ(k / 2), XPOW_READ(k / 2 - 1), xn);
            mpn_sqr(XPOW_WRITE(k), XPOW_READ(k / 2), xn);
        }

        for (cosorsin = sinonly; cosorsin < 2; cosorsin++)
        {
            flint_mpn_zero(s, xn + 1);

            /* todo: skip one nonscalar multiplication (use x^m)
               when starting on x^0 */
            power = (N - 1) % m;

            for (k = N - 1; k >= 0; k--)
            {
                c = factorial_tab_numer[2 * k + cosorsin];
                new_denom = factorial_tab_denom[2 * k + cosorsin];
                old_denom = factorial_tab_denom[2 * k + cosorsin + 2];

                /* change denominators */
                if (new_denom != old_denom && k < N - 1)
                {
                    if (alternating && (k % 2 == 0))
                        s[xn] += old_denom;

                    mpn_divrem_1(s, 0, s, xn + 1, old_denom);

                    if (alternating && (k % 2 == 0))
                        s[xn] -= 1;
                }

                if (power == 0)
                {
                    /* add c * x^0 -- only top limb is affected */
                    if (alternating & k)
                        s[xn] -= c;
                    else
                        s[xn] += c;

                    /* Outer polynomial evaluation: multiply by x^m */
                    if (k != 0)
                    {
                        mpn_mul(t, s, xn + 1, XPOW_READ(m), xn);
                        flint_mpn_copyi(s, t + xn, xn + 1);
                    }

                    power = m - 1;
                }
                else
                {
                    if (alternating & k)
                        s[xn] -= mpn_submul_1(s, XPOW_READ(power), xn, c);
                    else
                        s[xn] += mpn_addmul_1(s, XPOW_READ(power), xn, c);

                    power--;
                }
            }

            /* finally divide by denominator */
            if (cosorsin == 0)
            {
                mpn_divrem_1(t, 0, s, xn + 1, factorial_tab_denom[0]);

                /* perturb down to a number < 1 if necessary. note that this
                   does not invalidate the error bound: 1 - ulp is either
                   1 ulp too small or must be closer to the exact value */
                if (t[xn] == 0)
                    flint_mpn_copyi(ycos, t, xn);
                else
                    flint_mpn_store(ycos, xn, LIMB_ONES);
            }
            else
            {
                mpn_divrem_1(s, 0, s, xn + 1, factorial_tab_denom[0]);
                mpn_mul(t, s, xn + 1, x, xn);
                flint_mpn_copyi(ysin, t + xn, xn);
            }
        }

        /* error bound (ulp) */
        error[0] = 2;
    }

    TMP_END;
}
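The evaluation loop above is rectangular splitting (Paterson-Stockmeyer): about sqrt(N) precomputed powers, one full multiplication per row of m coefficients, and cheap scalar addmuls within a row. A sketch of the same recurrence with the limb arithmetic stripped out, on plain integers and hypothetical coefficients, cross-checked against Horner:

#include <stdio.h>

int main(void)
{
  const long x = 3, N = 7, m = 3;
  long c[7] = { 1, 2, 3, 4, 5, 6, 7 };   /* c[k] multiplies x^k */
  long xpow[4] = { 1, 3, 9, 27 };        /* 1, x, x^2, x^m */
  long s = 0, power = (N - 1) % m;

  for (long k = N - 1; k >= 0; k--)
    {
      if (power == 0)
        {
          s += c[k];                 /* add c[k] * x^0 */
          if (k != 0)
            s *= xpow[m];            /* one nonscalar multiply per row */
          power = m - 1;
        }
      else
        {
          s += c[k] * xpow[power];   /* scalar addmul, like mpn_addmul_1 */
          power--;
        }
    }

  long horner = 0;                   /* cross-check by plain Horner */
  for (long k = N - 1; k >= 0; k--)
    horner = horner * x + c[k];
  printf("%ld == %ld\n", s, horner);
  return 0;
}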
Example #11
mp_size_t
mpn_pow_1 (mp_ptr rp, mp_srcptr bp, mp_size_t bn, mp_limb_t exp, mp_ptr tp)
{
  mp_limb_t x;
  int cnt, i;
  mp_size_t rn;
  int par;

  ASSERT (bn >= 1);
  /* FIXME: Add operand overlap criteria */

  if (exp <= 1)
    {
      if (exp == 0)
	{
	  rp[0] = 1;
	  return 1;
	}
      else
	{
	  MPN_COPY (rp, bp, bn);
	  return bn;
	}
    }

  /* Count number of bits in exp, and compute where to put initial square in
     order to magically get results in the entry rp.  Use simple code,
     optimized for small exp.  For large exp, the bignum operations will take
     so much time that the slowness of this code will be negligible.  */
  par = 0;
  cnt = GMP_LIMB_BITS;
  for (x = exp; x != 0; x >>= 1)
    {
      par ^= x & 1;
      cnt--;
    }
  exp <<= cnt;

  if (bn == 1)
    {
      mp_limb_t bl = bp[0];

      if ((cnt & 1) != 0)
	MP_PTR_SWAP (rp, tp);

      mpn_sqr (rp, bp, bn);
      rn = 2 * bn; rn -= rp[rn - 1] == 0;

      for (i = GMP_LIMB_BITS - cnt - 1;;)
	{
	  exp <<= 1;
	  if ((exp & GMP_LIMB_HIGHBIT) != 0)
	    {
	      rp[rn] = mpn_mul_1 (rp, rp, rn, bl);
	      rn += rp[rn] != 0;
	    }

	  if (--i == 0)
	    break;

	  mpn_sqr (tp, rp, rn);
	  rn = 2 * rn; rn -= tp[rn - 1] == 0;
	  MP_PTR_SWAP (rp, tp);
	}
    }
  else
    {
      if (((par ^ cnt) & 1) == 0)
	MP_PTR_SWAP (rp, tp);

      mpn_sqr (rp, bp, bn);
      rn = 2 * bn; rn -= rp[rn - 1] == 0;

      for (i = GMP_LIMB_BITS - cnt - 1;;)
	{
	  exp <<= 1;
	  if ((exp & GMP_LIMB_HIGHBIT) != 0)
	    {
	      rn = rn + bn - (mpn_mul (tp, rp, rn, bp, bn) == 0);
	      MP_PTR_SWAP (rp, tp);
	    }

	  if (--i == 0)
	    break;

	  mpn_sqr (tp, rp, rn);
	  rn = 2 * rn; rn -= tp[rn - 1] == 0;
	  MP_PTR_SWAP (rp, tp);
	}
    }

  return rn;
}
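mpn_pow_1 is internal (note the caller-supplied scratch tp and the pointer-parity bookkeeping that lands the result in rp). From application code, the same square-and-multiply pattern is reached through the public mpz_pow_ui; a minimal sketch:

#include <gmp.h>

int main(void)
{
  mpz_t r, b;
  mpz_init(r);
  mpz_init_set_ui(b, 3);

  mpz_pow_ui(r, b, 100UL);   /* binary exponentiation, squaring at each step */
  gmp_printf("%Zd\n", r);

  mpz_clears(r, b, NULL);
  return 0;
}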
Example #12
mp_limb_t
mpn_mul (mp_ptr prodp,
	 mp_srcptr up, mp_size_t un,
	 mp_srcptr vp, mp_size_t vn)
{
  mp_size_t l, k;
  mp_limb_t c;

  ASSERT (un >= vn);
  ASSERT (vn >= 1);
  ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un));
  ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn));

  if (un == vn)
   {
    if (up == vp)
    {
      mpn_sqr (prodp, up, un);
      return prodp[2 * un - 1];
    }
    else
    {
      mpn_mul_n (prodp, up, vp, un);
      return prodp[2 * un - 1];
    }
   }

  if (vn < MUL_KARATSUBA_THRESHOLD)
    { /* plain schoolbook multiplication */
      if (un <= MUL_BASECASE_MAX_UN)
	mpn_mul_basecase (prodp, up, un, vp, vn);
      else
	{
	  /* We have un >> MUL_BASECASE_MAX_UN > vn.  For better memory
	     locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply
	     these pieces with the vp[] operand.  After each such partial
	     multiplication (but the last) we copy the most significant vn
	     limbs into a temporary buffer since that part would otherwise be
	     overwritten by the next multiplication.  After the next
	     multiplication, we add it back.  This illustrates the situation:

                                                    -->vn<--
                                                      |  |<------- un ------->|
                                                         _____________________|
                                                        X                    /|
                                                      /XX__________________/  |
                                    _____________________                     |
                                   X                    /                     |
                                 /XX__________________/                       |
               _____________________                                          |
              /                    /                                          |
            /____________________/                                            |
	    ==================================================================

	    The parts marked with X are the parts whose sums are copied into
	    the temporary buffer.  */

	  mp_limb_t tp[MUL_KARATSUBA_THRESHOLD_LIMIT];
	  mp_limb_t cy;
          ASSERT (MUL_KARATSUBA_THRESHOLD <= MUL_KARATSUBA_THRESHOLD_LIMIT);

	  mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
	  prodp += MUL_BASECASE_MAX_UN;
	  MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
	  up += MUL_BASECASE_MAX_UN;
	  un -= MUL_BASECASE_MAX_UN;
	  while (un > MUL_BASECASE_MAX_UN)
	    {
	      mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
	      cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
	      mpn_incr_u (prodp + vn, cy);		/* safe? */
	      prodp += MUL_BASECASE_MAX_UN;
	      MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
	      up += MUL_BASECASE_MAX_UN;
	      un -= MUL_BASECASE_MAX_UN;
	    }
	  if (un > vn)
	    {
	      mpn_mul_basecase (prodp, up, un, vp, vn);
	    }
	  else
	    {
	      ASSERT_ALWAYS (un > 0);
	      mpn_mul_basecase (prodp, vp, vn, up, un);
	    }
	  cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
	  mpn_incr_u (prodp + vn, cy);		/* safe? */
	}
      return prodp[un + vn - 1];
  }

  if (ABOVE_THRESHOLD (un + vn, 2*MUL_FFT_FULL_THRESHOLD)
      && ABOVE_THRESHOLD (3*vn, MUL_FFT_FULL_THRESHOLD))
    {
      mpn_mul_fft_main (prodp, up, un, vp, vn);
      return prodp[un + vn - 1];
    }

  k = (un + 3)/4; // ceil(un/4)

#if GMP_NUMB_BITS == 32
  if ((ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM8H_THRESHOLD)) && (vn>=86) && (5*un <= 11*vn))
#else
  if ((ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM8H_THRESHOLD)) && (vn>=86) && (4*un <= 13*vn))
#endif
  {
      mpn_toom8h_mul(prodp, up, un, vp, vn);
      return prodp[un + vn - 1];
  }
  
  if (ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM4_THRESHOLD))
  {
          if (vn > 3*k)
          {
             mpn_toom4_mul(prodp, up, un, vp, vn);
             return prodp[un + vn - 1];
          } else
          {
             l = (un + 4)/5; // ceil(un/5)
             if ((((vn > 9*k/4) && (un+vn <= 6*MUL_TOOM4_THRESHOLD)) 
                 || ((vn > 2*l) && (un+vn > 6*MUL_TOOM4_THRESHOLD)))
                 && (vn <= 3*l))
             {
                mpn_toom53_mul(prodp, up, un, vp, vn);
                return prodp[un + vn - 1];
             }
          }
  } 
  
  if (ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM3_THRESHOLD) && (vn > k))
  {
          mp_ptr ws;
          TMP_DECL;
          TMP_MARK;

          if (vn < 2*k) // un/2 >= vn > un/4
          {
                  ws = TMP_ALLOC_LIMBS (MPN_TOOM3_MUL_TSIZE(un));
                  mpn_toom42_mul(prodp, up, un, vp, vn, ws);
                  TMP_FREE;
                  return prodp[un + vn - 1];
          }

          l = (un+2)/3; //ceil(un/3)
          if (vn > 2*l) // un >= vn > 2un/3
          {
                  ws = TMP_ALLOC_LIMBS (MPN_TOOM3_MUL_TSIZE(un));
                  mpn_toom3_mul(prodp, up, un, vp, vn, ws);
                  TMP_FREE;
                  return prodp[un + vn - 1];
          } else // 2un/3 >= vn > un/3
          {
                  ws = TMP_ALLOC_LIMBS (MPN_TOOM3_MUL_TSIZE(un));
                  mpn_toom32_mul(prodp, up, un, vp, vn, ws);
                  TMP_FREE;
                  return prodp[un + vn - 1];
          }
  }

  mpn_mul_n (prodp, up, vp, vn);

  if (un != vn)
    { mp_limb_t t;
      mp_ptr ws;
      TMP_DECL;
      TMP_MARK;

      prodp += vn;
      l = vn;
      up += vn;
      un -= vn;

      if (un < vn)
	{
	  /* Swap u's and v's. */
	  MPN_SRCPTR_SWAP (up,un, vp,vn);
	}

      ws = TMP_ALLOC_LIMBS ((vn >= MUL_KARATSUBA_THRESHOLD ? vn : un) + vn);

      t = 0;
      while (vn >= MUL_KARATSUBA_THRESHOLD)
	{
	  mpn_mul_n (ws, up, vp, vn);
	  if (l <= 2*vn)
	    {
	      t += mpn_add_n (prodp, prodp, ws, l);
	      if (l != 2*vn)
		{
		  t = mpn_add_1 (prodp + l, ws + l, 2*vn - l, t);
		  l = 2*vn;
		}
	    }
	  else
	    {
	      c = mpn_add_n (prodp, prodp, ws, 2*vn);
	      t += mpn_add_1 (prodp + 2*vn, prodp + 2*vn, l - 2*vn, c);
	    }
	  prodp += vn;
	  l -= vn;
	  up += vn;
	  un -= vn;
	  if (un < vn)
	    {
	      /* Swap u's and v's. */
	      MPN_SRCPTR_SWAP (up,un, vp,vn);
	    }
		}

      if (vn != 0)
	{
	  mpn_mul_basecase (ws, up, un, vp, vn);
	  if (l <= un + vn)
	    {
	      t += mpn_add_n (prodp, prodp, ws, l);
	      if (l != un + vn)
		t = mpn_add_1 (prodp + l, ws + l, un + vn - l, t);
	    }
	  else
	    {
	      c = mpn_add_n (prodp, prodp, ws, un + vn);
	      t += mpn_add_1 (prodp + un + vn, prodp + un + vn, l - un - vn, c);
	    }
	}

      TMP_FREE;
  }

  return prodp[un + vn - 1];
}
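A minimal direct caller for the dispatcher above; mpn_mul requires un >= vn >= 1 and a non-overlapping product area of un + vn limbs, and returns the most significant limb of the product:

#include <gmp.h>

int main(void)
{
  mp_limb_t u[2] = { 123, 456 };   /* the value 456*B + 123 */
  mp_limb_t v[1] = { 789 };
  mp_limb_t p[3];                  /* un + vn limbs */
  mp_limb_t high;

  high = mpn_mul(p, u, 2, v, 1);

  gmp_printf("%Nd (top limb %Mu)\n", p, (mp_size_t) 3, high);
  return 0;
}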
Example #13
mp_size_t mpn_remove_power_ascending(mp_ptr x, mp_size_t xsize,
                                      mp_ptr p, mp_size_t psize, ulong *exp)
{
    int i, maxi;
    mp_ptr div;
    mp_ptr rem;
    mp_ptr square[FLINT_BITS];
    mp_size_t square_size[FLINT_BITS];
    mp_size_t sqsize;

    *exp = 0;

    if (psize > xsize)
        return xsize;

    maxi = 0;
    square[0] = p;
    square_size[0] = psize;

    /* Most likely less memory will be needed, but this way we
       avoid reallocations */
    div = flint_malloc(sizeof(mp_limb_t) * xsize);
    rem = flint_malloc(sizeof(mp_limb_t) * xsize);

    /* Remove ascending powers */
    for (i = 0; i < FLINT_BITS && xsize >= square_size[i]; i++)
    {
        mpn_tdiv_qr(div, rem, 0, x, xsize, square[i], square_size[i]);
        if (!mpn_zero_p(rem, square_size[i]))
        {
            i -= 1;
            break;
        }

        *exp += (1 << i);
        xsize = xsize - square_size[i] + 1;
        if (div[xsize-1] == 0)
            xsize--;
        mpn_copyi(x, div, xsize);

        /* Form next square if needed */
        sqsize = square_size[i] * 2;
        if (sqsize - 1 > xsize)
            break;
        maxi = i + 1;
        square[i + 1] = flint_malloc(sizeof(mp_limb_t) * sqsize);
        mpn_sqr(square[i + 1], square[i], square_size[i]);
        if (square[i + 1][sqsize - 1] == 0)
            sqsize -= 1;
        square_size[i + 1] = sqsize;
   }

    /* Remove descending powers */
    for ( ; i >= 0; i--)
    {
        if (xsize >= square_size[i])
        {
            mpn_tdiv_qr(div, rem, 0, x, xsize, square[i], square_size[i]);
            if (mpn_zero_p(rem, square_size[i]))
            {
                *exp += (1 << i);
                xsize = xsize - square_size[i] + 1;
                if (div[xsize-1] == 0)
                    xsize--;
                mpn_copyi(x, div, xsize);
            }
        }
    }

    for (i = 1; i <= maxi; i++)
        flint_free(square[i]);
    flint_free(div);
    flint_free(rem);
    return xsize;
}
Example #14
mp_bitcnt_t
mpn_remove (mp_ptr wp, mp_size_t *wn,
	    mp_ptr up, mp_size_t un, mp_ptr vp, mp_size_t vn,
	    mp_bitcnt_t cap)
{
  mp_ptr    pwpsp[LOG];
  mp_size_t pwpsn[LOG];
  mp_size_t npowers;
  mp_ptr tp, qp, np, pp, qp2;
  mp_size_t pn, nn, qn, i;
  mp_bitcnt_t pwr;
  TMP_DECL;

  ASSERT (un > 0);
  ASSERT (vn > 0);
  ASSERT (vp[0] % 2 != 0);	/* 2-adic division wants odd numbers */
  ASSERT (vn > 1 || vp[0] > 1);	/* else we would loop indefinitely */

  TMP_MARK;

  tp = TMP_ALLOC_LIMBS ((un + 1 + vn) / 2); /* remainder */
  qp = TMP_ALLOC_LIMBS (un + 1);	/* quotient, alternating */
  qp2 = TMP_ALLOC_LIMBS (un + 1);	/* quotient, alternating */
  np = TMP_ALLOC_LIMBS (un + LOG);	/* powers of V */
  pp = vp;
  pn = vn;

  MPN_COPY (qp, up, un);
  qn = un;

  npowers = 0;
  while (qn >= pn)
    {
      qp[qn] = 0;
      mpn_bdiv_qr_wrap (qp2, tp, qp, qn + 1, pp, pn);
      if (!mpn_zero_p (tp, pn))
	break;			/* could not divide by V^npowers */

      MP_PTR_SWAP (qp, qp2);
      qn = qn - pn;
      qn += qp[qn] != 0;

      pwpsp[npowers] = pp;
      pwpsn[npowers] = pn;
      npowers++;

      if (((mp_bitcnt_t) 2 << npowers) - 1 > cap)
	break;

      nn = 2 * pn - 1;		/* next power will be at least this large */
      if (nn > qn)
	break;			/* next power would be overlarge */

      mpn_sqr (np, pp, pn);
      nn += np[nn] != 0;
      pp = np;
      pn = nn;
      np += nn;
    }

  pwr = ((mp_bitcnt_t) 1 << npowers) - 1;

  for (i = npowers - 1; i >= 0; i--)
    {
      pp = pwpsp[i];
      pn = pwpsn[i];
      if (qn < pn)
	continue;

      if (pwr + ((mp_bitcnt_t) 1 << i) > cap)
	continue;		/* V^i would bring us past cap */

      qp[qn] = 0;
      mpn_bdiv_qr_wrap (qp2, tp, qp, qn + 1, pp, pn);
      if (!mpn_zero_p (tp, pn))
	continue;		/* could not divide by V^i */

      MP_PTR_SWAP (qp, qp2);
      qn = qn - pn;
      qn += qp[qn] != 0;

      pwr += (mp_bitcnt_t) 1 << i;
    }

  MPN_COPY (wp, qp, qn);
  *wn = qn;

  TMP_FREE;

  return pwr;
}
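In current GMP this function sits underneath the public mpz_remove, which strips every factor of f out of n and reports the multiplicity; a usage sketch (note the odd factor, matching the vp[0] % 2 != 0 assertion above):

#include <gmp.h>

int main(void)
{
  mpz_t r, n, f;
  mpz_inits(r, n, f, NULL);
  mpz_set_ui(n, 162);   /* 2 * 3^4 */
  mpz_set_ui(f, 3);

  mp_bitcnt_t e = mpz_remove(r, n, f);   /* r = 2, e = 4 */
  gmp_printf("%Zd, exponent %lu\n", r, (unsigned long) e);

  mpz_clears(r, n, f, NULL);
  return 0;
}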
Example #15
void
mpz_powm (mpz_ptr r, mpz_srcptr b, mpz_srcptr e, mpz_srcptr m)
{
  mp_ptr xp, tp, qp, gp, this_gp;
  mp_srcptr bp, ep, mp;
  mp_size_t bn, es, en, mn, xn;
  mp_limb_t invm, c;
  unsigned long int enb;
  mp_size_t i, K, j, l, k;
  int m_zero_cnt, e_zero_cnt;
  int sh;
  int use_redc;
#if HANDLE_NEGATIVE_EXPONENT
  mpz_t new_b;
#endif
#if REDUCE_EXPONENT
  mpz_t new_e;
#endif
  TMP_DECL;

  mp = PTR(m);
  mn = ABSIZ (m);
  if (mn == 0)
    DIVIDE_BY_ZERO;

  TMP_MARK;

  es = SIZ (e);
  if (es <= 0)
    {
      if (es == 0)
	{
	  /* Exponent is zero, result is 1 mod m, i.e., 1 or 0 depending on if
	     m equals 1.  */
	  SIZ(r) = (mn == 1 && mp[0] == 1) ? 0 : 1;
	  PTR(r)[0] = 1;
	  TMP_FREE;	/* we haven't really allocated anything here */
	  return;
	}
#if HANDLE_NEGATIVE_EXPONENT
      MPZ_TMP_INIT (new_b, mn + 1);

      if (! mpz_invert (new_b, b, m))
	DIVIDE_BY_ZERO;
      b = new_b;
      es = -es;
#else
      DIVIDE_BY_ZERO;
#endif
    }
  en = es;

#if REDUCE_EXPONENT
  /* Reduce exponent by dividing it by phi(m) when m small.  */
  if (mn == 1 && mp[0] < 0x7fffffffL && en * GMP_NUMB_BITS > 150)
    {
      MPZ_TMP_INIT (new_e, 2);
      mpz_mod_ui (new_e, e, phi (mp[0]));
      e = new_e;
    }
#endif

  use_redc = mn < POWM_THRESHOLD && mp[0] % 2 != 0;
  if (use_redc)
    {
      /* invm = -1/m mod 2^BITS_PER_MP_LIMB, must have m odd */
      modlimb_invert (invm, mp[0]);
      invm = -invm;
    }
  else
    {
      /* Normalize m (i.e. make its most significant bit set) as required by
	 division functions below.  */
      count_leading_zeros (m_zero_cnt, mp[mn - 1]);
      m_zero_cnt -= GMP_NAIL_BITS;
      if (m_zero_cnt != 0)
	{
	  mp_ptr new_mp;
	  new_mp = TMP_ALLOC_LIMBS (mn);
	  mpn_lshift (new_mp, mp, mn, m_zero_cnt);
	  mp = new_mp;
	}
    }

  /* Determine optimal value of k, the number of exponent bits we look at
     at a time.  */
  count_leading_zeros (e_zero_cnt, PTR(e)[en - 1]);
  e_zero_cnt -= GMP_NAIL_BITS;
  enb = en * GMP_NUMB_BITS - e_zero_cnt; /* number of bits of exponent */
  k = 1;
  K = 2;
  while (2 * enb > K * (2 + k * (3 + k)))
    {
      k++;
      K *= 2;
      if (k == 10)			/* cap allocation */
	break;
    }

  tp = TMP_ALLOC_LIMBS (2 * mn);
  qp = TMP_ALLOC_LIMBS (mn + 1);

  gp = __GMP_ALLOCATE_FUNC_LIMBS (K / 2 * mn);

  /* Compute x*R^n where R=2^BITS_PER_MP_LIMB.  */
  bn = ABSIZ (b);
  bp = PTR(b);
  /* Handle |b| >= m by computing b mod m.  FIXME: It is not strictly necessary
     for speed or correctness to do this when b and m have the same number of
     limbs, perhaps remove mpn_cmp call.  */
  if (bn > mn || (bn == mn && mpn_cmp (bp, mp, mn) >= 0))
    {
      /* Reduce possibly huge base while moving it to gp[0].  Use a function
	 call to reduce, since we don't want the quotient allocation to
	 live until function return.  */
      if (use_redc)
	{
	  reduce (tp + mn, bp, bn, mp, mn);	/* b mod m */
	  MPN_ZERO (tp, mn);
	  mpn_tdiv_qr (qp, gp, 0L, tp, 2 * mn, mp, mn); /* unnormalized! */
	}
      else
	{
	  reduce (gp, bp, bn, mp, mn);
	}
    }
  else
    {
      /* |b| < m.  We pad out operands to become mn limbs,  which simplifies
	 the rest of the function, but slows things down when the |b| << m.  */
      if (use_redc)
	{
	  MPN_ZERO (tp, mn);
	  MPN_COPY (tp + mn, bp, bn);
	  MPN_ZERO (tp + mn + bn, mn - bn);
	  mpn_tdiv_qr (qp, gp, 0L, tp, 2 * mn, mp, mn);
	}
      else
	{
	  MPN_COPY (gp, bp, bn);
	  MPN_ZERO (gp + bn, mn - bn);
	}
    }

  /* Compute g[i] = x^(2i+1), the odd powers of x, for i < K/2.  */

  xp = TMP_ALLOC_LIMBS (mn);
  mpn_sqr (tp, gp, mn);
  if (use_redc)
    mpn_redc_1 (xp, tp, mp, mn, invm);		/* xx = x^2*R^n */
  else
    mpn_tdiv_qr (qp, xp, 0L, tp, 2 * mn, mp, mn);
  this_gp = gp;
  for (i = 1; i < K / 2; i++)
    {
      mpn_mul_n (tp, this_gp, xp, mn);
      this_gp += mn;
      if (use_redc)
	mpn_redc_1 (this_gp,tp, mp, mn, invm);	/* g[i] = x^(2i+1)*R^n */
      else
	mpn_tdiv_qr (qp, this_gp, 0L, tp, 2 * mn, mp, mn);
    }

  /* Start the real stuff.  */
  ep = PTR (e);
  i = en - 1;				/* current index */
  c = ep[i];				/* current limb */
  sh = GMP_NUMB_BITS - e_zero_cnt;	/* significant bits in ep[i] */
  sh -= k;				/* index of lower bit of ep[i] to take into account */
  if (sh < 0)
    {					/* k-sh extra bits are needed */
      if (i > 0)
	{
	  i--;
	  c <<= (-sh);
	  sh += GMP_NUMB_BITS;
	  c |= ep[i] >> sh;
	}
    }
Example #16
/* Computes a^{1/k - 1} (mod B^n). Both a and k must be odd.

   Iterates

     r' <-- r - r * (a^{k-1} r^k - 1) / k

   If

     a^{k-1} r^k = 1 (mod 2^m),

   then

     a^{k-1} r'^k = 1 (mod 2^{2m}).

   Compute the update term as

     r' = r - (a^{k-1} r^{k+1} - r) / k

   where we still have cancellation of low limbs.

 */
void
mpn_broot_invm1 (mp_ptr rp, mp_srcptr ap, mp_size_t n, mp_limb_t k)
{
  mp_size_t sizes[GMP_LIMB_BITS * 2];
  mp_ptr akm1, tp, rnp, ep;
  mp_limb_t a0, r0, km1, kp1h, kinv;
  mp_size_t rn;
  unsigned i;

  TMP_DECL;

  ASSERT (n > 0);
  ASSERT (ap[0] & 1);
  ASSERT (k & 1);
  ASSERT (k >= 3);

  TMP_MARK;

  akm1 = TMP_ALLOC_LIMBS (4*n);
  tp = akm1 + n;

  km1 = k-1;
  /* FIXME: Could arrange the iteration so we don't need to compute
     this up front, computing a^{k-1} * r^k as (a r)^{k-1} * r. Note
     that we can use wraparound also for a*r, since the low half is
     unchanged from the previous iteration. Or possibly mulmid. Also,
     a r = a^{1/k}, so we get that value too, for free? */
  mpn_powlo (akm1, ap, &km1, 1, n, tp); /* 3 n scratch space */

  a0 = ap[0];
  binvert_limb (kinv, k);

  /* 4 bits: a^{1/k - 1} (mod 16):

	a % 8
	1 3 5 7
   k%4 +-------
     1 |1 1 1 1
     3 |1 9 9 1
  */
  r0 = 1 + (((k << 2) & ((a0 << 1) ^ (a0 << 2))) & 8);
  r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k & 0x7f)); /* 8 bits */
  r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k & 0x7fff)); /* 16 bits */
  r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k)); /* 32 bits */
#if GMP_NUMB_BITS > 32
  {
    unsigned prec = 32;
    do
      {
	r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k));
	prec *= 2;
      }
    while (prec < GMP_NUMB_BITS);
  }
#endif

  rp[0] = r0;
  if (n == 1)
    {
      TMP_FREE;
      return;
    }

  /* For odd k, (k+1)/2 = k/2+1, and the latter avoids overflow. */
  kp1h = k/2 + 1;

  /* FIXME: Special case for two limb iteration. */
  rnp = TMP_ALLOC_LIMBS (2*n + 1);
  ep = rnp + n;

  /* FIXME: Possible to do this on the fly with some bit fiddling. */
  for (i = 0; n > 1; n = (n + 1)/2)
    sizes[i++] = n;

  rn = 1;

  while (i-- > 0)
    {
      /* Compute x^{k+1}. */
      mpn_sqr (ep, rp, rn); /* For odd n, writes n+1 limbs in the
			       final iteration. */
      mpn_powlo (rnp, ep, &kp1h, 1, sizes[i], tp);

      /* Multiply by a^{k-1}. Can use wraparound; low part equals r. */

      mpn_mullo_n (ep, rnp, akm1, sizes[i]);
      ASSERT (mpn_cmp (ep, rp, rn) == 0);

      ASSERT (sizes[i] <= 2*rn);
      mpn_pi1_bdiv_q_1 (rp + rn, ep + rn, sizes[i] - rn, k, kinv, 0);
      mpn_neg (rp + rn, rp + rn, sizes[i] - rn);
      rn = sizes[i];
    }
  TMP_FREE;
}
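The precision-doubling argument in the header comment is plain Hensel lifting, and the single-limb warm-up (the r0 lines above) is easiest to watch in isolation. A sketch for the simplest case, the inverse mod 2^64, where each Newton step r' = r*(2 - a*r) doubles the number of correct low bits; the same doubling drives binvert_limb:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
  uint64_t a = 0x123456789abcdef1ULL;  /* must be odd */
  uint64_t r = a;                      /* a*a == 1 (mod 8): 3 good bits */

  for (int i = 0; i < 5; i++)          /* 3 -> 6 -> 12 -> 24 -> 48 -> 96 bits */
    r *= 2 - a * r;

  printf("a*r mod 2^64 = %llu\n", (unsigned long long) (a * r));  /* prints 1 */
  return 0;
}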
Example #17
/*
   ret + (xp, n) = (yp, n)*(zp, n) mod (2^b + 1)
   needs (tp, 2n) temp space, everything reduced mod 2^b + 1
   inputs, outputs are fully reduced

   N.B: 2n is not the same as 2b rounded up to the nearest limb!
*/
inline static int
mpn_mulmod_2expp1_internal (mp_ptr xp, mp_srcptr yp, mp_srcptr zp,
			                                         mpir_ui b, mp_ptr tp)
{
  mp_size_t n, k;
  mp_limb_t c;

  TMP_DECL;

  n = BITS_TO_LIMBS (b);
  k = GMP_NUMB_BITS * n - b;

  ASSERT(b > 0);
  ASSERT(n > 0);
  ASSERT_MPN(yp, n);
  ASSERT_MPN(zp, n);
  ASSERT(!MPN_OVERLAP_P (tp, 2 * n, yp, n));
  ASSERT(!MPN_OVERLAP_P (tp, 2 * n, zp, n));
  ASSERT(MPN_SAME_OR_SEPARATE_P (xp, tp, n));
  ASSERT(MPN_SAME_OR_SEPARATE_P (xp, tp + n, n));
  ASSERT(k == 0 || yp[n - 1] >> (GMP_NUMB_BITS - k) == 0);
  ASSERT(k == 0 || zp[n - 1] >> (GMP_NUMB_BITS - k) == 0);

#ifndef TUNE_PROGRAM_BUILD
  if (k == 0 && n > FFT_MULMOD_2EXPP1_CUTOFF && n == mpir_fft_adjust_limbs(n))
  {
      mp_bitcnt_t depth1, depth = 1;
      mp_size_t w1, off;
      mp_ptr tx, ty, tz;
      mp_limb_t ret;

      TMP_MARK;

      tx = TMP_BALLOC_LIMBS(3*n + 3);
      ty = tx + n + 1;
      tz = ty + n + 1;

      MPN_COPY(ty, yp, n);
      MPN_COPY(tz, zp, n);
      ty[n] = 0;
      tz[n] = 0;

      while ((((mp_limb_t)1)<<depth) < b) depth++;
   
      if (depth < 12) off = mulmod_2expp1_table_n[0];
      else off = mulmod_2expp1_table_n[MIN(depth, FFT_N_NUM + 11) - 12];
      depth1 = depth/2 - off;
   
      w1 = b/(((mp_limb_t)1)<<(2*depth1));

      mpir_fft_mulmod_2expp1(tx, ty, tz, n, depth1, w1);

      MPN_COPY(xp, tx, n);
      ret = tx[n];
      
      TMP_FREE;

	   return ret;
  }
#endif

  if (yp == zp)
     mpn_sqr(tp, yp, n);
  else
     mpn_mul_n (tp, yp, zp, n);

  if (k == 0)
    {
      c = mpn_sub_n (xp, tp, tp + n, n);

      return mpn_add_1 (xp, xp, n, c);
    }

  c = tp[n - 1];
  tp[n - 1] &= GMP_NUMB_MASK >> k;

#if HAVE_NATIVE_mpn_sublsh_nc
  c = mpn_sublsh_nc (xp, tp, tp + n, n, k, c);
#else
  {
    mp_limb_t c1;
    c1 = mpn_lshift (tp + n, tp + n, n, k);
    tp[n] |= c >> (GMP_NUMB_BITS - k);
    c = mpn_sub_n (xp, tp, tp + n, n) + c1;
  }
#endif

  c = mpn_add_1 (xp, xp, n, c);
  xp[n - 1] &= GMP_NUMB_MASK >> k;

  return c;
}
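What the function computes can be cross-checked from the mpz layer. A sketch for the squaring branch (yp == zp), assuming GMP or MPIR and the usual -lgmp/-lmpir link:

#include <gmp.h>

int main(void)
{
  const unsigned long b = 100;
  mpz_t m, y, r;
  mpz_inits(m, y, r, NULL);

  mpz_ui_pow_ui(m, 2, b);
  mpz_add_ui(m, m, 1);       /* m = 2^b + 1 */
  mpz_set_str(y, "1234567890123456789", 10);

  mpz_mul(r, y, y);          /* the yp == zp squaring branch */
  mpz_mod(r, r, m);
  gmp_printf("%Zd\n", r);

  mpz_clears(m, y, r, NULL);
  return 0;
}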
Example #18
void _arb_exp_taylor_rs(mp_ptr y, mp_limb_t * error,
    mp_srcptr x, mp_size_t xn, ulong N)
{
    mp_ptr s, t, xpow;
    mp_limb_t new_denom, old_denom, c;
    slong power, k, m;

    TMP_INIT;
    TMP_START;

    if (N >= FACTORIAL_TAB_SIZE - 1)
    {
        flint_printf("_arb_exp_taylor_rs: N too large!\n");
        abort();
    }

    if (N <= 3)
    {
        if (N <= 1)
        {
            flint_mpn_zero(y, xn);
            y[xn] = N;
            error[0] = 0;
        }
        else if (N == 2)
        {
            flint_mpn_copyi(y, x, xn);
            y[xn] = 1;
            error[0] = 0;
        }
        else
        {
            /* 1 + x + x^2 / 2 */
            t = TMP_ALLOC_LIMBS(2 * xn);

            mpn_sqr(t, x, xn);
            mpn_rshift(t + xn, t + xn, xn, 1);
            y[xn] = mpn_add_n(y, x, t + xn, xn) + 1;

            error[0] = 2;
        }
    }
    else
    {
        /* Choose m ~= sqrt(num_terms) (m must be even, >= 2) */
        /* TODO: drop evenness assumption since we don't have sign issues here? */
        /* TODO: then just need to fix power construction below... */
        m = 2;
        while (m * m < N)
            m += 2;

        /* todo: merge allocations */
        xpow = TMP_ALLOC_LIMBS((m + 1) * xn);
        s = TMP_ALLOC_LIMBS(xn + 2);
        t = TMP_ALLOC_LIMBS(2 * xn + 2);     /* todo: 1 limb too much? */

        /* higher index ---> */
        /*        | ---xn--- | */
        /* xpow = |  <temp>  | x^m | x^(m-1) | ... | x^2 | x | */

#define XPOW_WRITE(__k) (xpow + (m - (__k)) * xn)
#define XPOW_READ(__k) (xpow + (m - (__k) + 1) * xn)

        flint_mpn_copyi(XPOW_READ(1), x, xn);
        mpn_sqr(XPOW_WRITE(2), XPOW_READ(1), xn);

        for (k = 4; k <= m; k += 2)
        {
            mpn_mul_n(XPOW_WRITE(k - 1), XPOW_READ(k / 2), XPOW_READ(k / 2 - 1), xn);
            mpn_sqr(XPOW_WRITE(k), XPOW_READ(k / 2), xn);
        }

        flint_mpn_zero(s, xn + 1);

        /* todo: skip one nonscalar multiplication (use x^m)
           when starting on x^0 */
        power = (N - 1) % m;

        for (k = N - 1; k >= 0; k--)
        {
            c = factorial_tab_numer[k];
            new_denom = factorial_tab_denom[k];
            old_denom = factorial_tab_denom[k+1];

            /* change denominators */
            if (new_denom != old_denom && k < N - 1)
            {
                mpn_divrem_1(s, 0, s, xn + 1, old_denom);
            }

            if (power == 0)
            {
                /* add c * x^0 -- only top limb is affected */
                s[xn] += c;

                /* Outer polynomial evaluation: multiply by x^m */
                if (k != 0)
                {
                    mpn_mul(t, s, xn + 1, XPOW_READ(m), xn);
                    flint_mpn_copyi(s, t + xn, xn + 1);
                }

                power = m - 1;
            }
            else
            {
                s[xn] += mpn_addmul_1(s, XPOW_READ(power), xn, c);

                power--;
            }
        }

        /* finally divide by denominator */
        mpn_divrem_1(y, 0, s, xn + 1, factorial_tab_denom[0]);

        /* error bound (ulp) */
        error[0] = 2;
    }

    TMP_END;
}
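Both Taylor evaluators build their power table with one mpn_sqr per even power and one mpn_mul_n per odd power, reusing the half-index entries. The addressing is easier to follow with the limbs stripped out; a toy version on plain integers:

#include <stdio.h>

int main(void)
{
  const int m = 6;    /* table up to x^m, m even */
  long x = 3, pow[7];

  pow[1] = x;
  pow[2] = pow[1] * pow[1];                     /* mpn_sqr  */
  for (int k = 4; k <= m; k += 2)
    {
      pow[k - 1] = pow[k / 2] * pow[k / 2 - 1]; /* mpn_mul_n */
      pow[k]     = pow[k / 2] * pow[k / 2];     /* mpn_sqr  */
    }

  for (int k = 1; k <= m; k++)
    printf("x^%d = %ld\n", k, pow[k]);
  return 0;
}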