/* Input is {ap,rn}; output is {rp,rn}, computation is
   mod B^rn - 1, and values are semi-normalised; zero is represented
   as either 0 or B^rn - 1.  Needs a scratch of 2rn limbs at tp.
   tp==rp is allowed. */
static void
mpn_bc_sqrmod_bnm1 (mp_ptr rp, mp_srcptr ap, mp_size_t rn, mp_ptr tp)
{
  mp_limb_t cy;

  ASSERT (0 < rn);

  mpn_sqr (tp, ap, rn);
  cy = mpn_add_n (rp, tp, tp + rn, rn);
  /* If cy == 1, then the value of rp is at most B^rn - 2, so there can
   * be no overflow when adding in the carry. */
  MPN_INCR_U (rp, rn, cy);
}
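The reduction above works because B^rn == 1 (mod B^rn - 1): the high half is folded onto the low half and the carry is added back in. A minimal single-word sketch of the same step (plain C, not part of the library code; 64-bit word, modulus 2^32 - 1):

#include <assert.h>
#include <stdint.h>

/* Fold a 64-bit value modulo 2^32 - 1: add the high half to the low
   half, then fold the carry back in.  The result is semi-normalised,
   exactly as described above: zero may come out as 0 or 2^32 - 1. */
static uint32_t
fold_mod_2p32m1 (uint64_t x)
{
  uint64_t s = (x & 0xffffffffu) + (x >> 32);   /* lo + hi */
  s = (s & 0xffffffffu) + (s >> 32);            /* add the carry back in */
  return (uint32_t) s;
}

int
main (void)
{
  uint64_t x = 0x123456789abcdef0ULL;
  assert (fold_mod_2p32m1 (x) % 0xffffffffu == x % 0xffffffffu);
  return 0;
}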
Example #2
void
mpn_invert (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
  ASSERT (n > 0);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));

  if (n == 1)
    invert_limb (*ip, *dp);
  else if (BELOW_THRESHOLD (n, INV_APPR_THRESHOLD))
    {
	/* Maximum scratch needed by this branch: 2*n */
	mp_size_t i;
	mp_ptr xp;

	xp = scratch;				/* 2 * n limbs */
	/* n > 1 here */
	i = n;
	do
	  xp[--i] = GMP_NUMB_MAX;
	while (i);
	mpn_com (xp + n, dp, n);
	if (n == 2) {
	  mpn_divrem_2 (ip, 0, xp, 4, dp);
	} else {
	  gmp_pi1_t inv;
	  invert_pi1 (inv, dp[n-1], dp[n-2]);
	  /* FIXME: should we use dcpi1_div_q, for big sizes? */
	  mpn_sbpi1_div_q (ip, xp, 2 * n, dp, n, inv.inv32);
	}
    }
  else { /* Use approximated inverse; correct the result if needed. */
      mp_limb_t e; /* The possible error in the approximate inverse */

      ASSERT ( mpn_invert_itch (n) >= mpn_invertappr_itch (n) );
      e = mpn_ni_invertappr (ip, dp, n, scratch);

      if (UNLIKELY (e)) { /* Assume the error can only be "0" (no error) or "1". */
	/* Code to detect and correct the "off by one" approximation. */
	mpn_mul_n (scratch, ip, dp, n);
	e = mpn_add_n (scratch, scratch, dp, n); /* FIXME: we only need e.*/
	if (LIKELY(e)) /* The high part can not give a carry by itself. */
	  e = mpn_add_nc (scratch + n, scratch + n, dp, n, e); /* FIXME:e */
	/* If the value was wrong (no carry), correct it (increment). */
	e ^= CNST_LIMB (1);
	MPN_INCR_U (ip, n, e);
      }
  }
}
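In the below-threshold branch the dividend assembled in {xp,2n} is B^(2n) - 1 - D*B^n (an all-ones low half and the complement of the divisor in the high half), so the quotient written to {ip,n} is floor((B^(2n) - 1)/D) - B^n. A hedged check of that identity with the public mpz interface, on a toy one-"limb" divisor (names local to the sketch):

#include <assert.h>
#include <gmp.h>

int
main (void)
{
  const unsigned long limb_bits = 16, n = 1;    /* toy limb size */
  mpz_t Bn, B2n, D, X, q1, q2;

  mpz_inits (Bn, B2n, D, X, q1, q2, NULL);
  mpz_ui_pow_ui (Bn, 2, limb_bits * n);         /* B^n  */
  mpz_ui_pow_ui (B2n, 2, 2 * limb_bits * n);    /* B^2n */
  mpz_set_ui (D, 0x9abc);                       /* high bit set, as the ASSERT above requires */

  mpz_sub_ui (X, B2n, 1);                       /* X = B^2n - 1 - D*B^n */
  mpz_submul (X, D, Bn);
  mpz_fdiv_q (q1, X, D);                        /* quotient produced by the branch */

  mpz_sub_ui (q2, B2n, 1);                      /* floor((B^2n - 1)/D) - B^n */
  mpz_fdiv_q (q2, q2, D);
  mpz_sub (q2, q2, Bn);

  assert (mpz_cmp (q1, q2) == 0);
  mpz_clears (Bn, B2n, D, X, q1, q2, NULL);
  return 0;
}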
Example #3
void
mpn_invert (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
  ASSERT (n > 0);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));

  if (n == 1)
    invert_limb (*ip, *dp);
  else {
    TMP_DECL;

    TMP_MARK;
    if (BELOW_THRESHOLD (n, INV_APPR_THRESHOLD))
      {
	/* Maximum scratch needed by this branch: 2*n */
	mp_size_t i;
	mp_ptr xp;

	xp = scratch;				/* 2 * n limbs */
	for (i = n - 1; i >= 0; i--)
	  xp[i] = GMP_NUMB_MAX;
	mpn_com (xp + n, dp, n);
	if (n == 2) {
	  mpn_divrem_2 (ip, 0, xp, 4, dp);
	} else {
	  gmp_pi1_t inv;
	  invert_pi1 (inv, dp[n-1], dp[n-2]);
	  /* FIXME: should we use dcpi1_div_q, for big sizes? */
	  mpn_sbpi1_div_q (ip, xp, 2 * n, dp, n, inv.inv32);
	}
      }
    else { /* Use approximated inverse; correct the result if needed. */
      mp_limb_t e; /* The possible error in the approximate inverse */

      ASSERT ( mpn_invert_itch (n) >= mpn_invertappr_itch (n) );
      e = mpn_ni_invertappr (ip, dp, n, scratch);

      if (UNLIKELY (e)) { /* Assume the error can only be "0" (no error) or "1". */
	/* Code to detect and correct the "off by one" approximation. */
	mpn_mul_n (scratch, ip, dp, n);
	ASSERT_NOCARRY (mpn_add_n (scratch + n, scratch + n, dp, n));
	if (! mpn_add (scratch, scratch, 2*n, dp, n))
	  MPN_INCR_U (ip, n, 1); /* The value was wrong, correct it.  */
      }
    }
    TMP_FREE;
  }
}
/* Input is {ap,rn+1}; output is {rp,rn+1}, in
   semi-normalised representation, computation is mod B^rn + 1. Needs
   a scratch area of 2rn + 2 limbs at tp; tp == rp is allowed.
   Output is normalised. */
static void
mpn_bc_sqrmod_bnp1 (mp_ptr rp, mp_srcptr ap, mp_size_t rn, mp_ptr tp)
{
  mp_limb_t cy;

  ASSERT (0 < rn);

  mpn_sqr (tp, ap, rn + 1);
  ASSERT (tp[2*rn+1] == 0);
  ASSERT (tp[2*rn] < GMP_NUMB_MAX);
  cy = tp[2*rn] + mpn_sub_n (rp, tp, tp+rn, rn);
  rp[rn] = 0;
  MPN_INCR_U (rp, rn+1, cy );
}
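Here B^rn == -1 (mod B^rn + 1), so the square is reduced by subtracting the high half from the low half and correcting with the borrow and the top limb. A one-word sketch of the same reduction (plain C, modulus 2^32 + 1, names local to the sketch):

#include <assert.h>
#include <stdint.h>

/* Reduce a 64-bit value hi*2^32 + lo modulo 2^32 + 1: since
   2^32 == -1, the residue is lo - hi, with one correction when the
   subtraction goes negative. */
static uint64_t
reduce_mod_2p32p1 (uint64_t x)
{
  uint64_t lo = x & 0xffffffffu, hi = x >> 32;
  return lo >= hi ? lo - hi : lo + 0x100000001ULL - hi;
}

int
main (void)
{
  uint64_t x = 0xfedcba9876543210ULL;
  assert (reduce_mod_2p32p1 (x) == x % 0x100000001ULL);
  return 0;
}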
Example #5
REGPARM_ATTR (1) static void
cfdiv_r_2exp (mpz_ptr w, mpz_srcptr u, unsigned long cnt, int dir)
{
  mp_size_t  usize, abs_usize, limb_cnt, i;
  mp_srcptr  up;
  mp_ptr     wp;
  mp_limb_t  high;

  usize = SIZ(u);
  if (usize == 0)
    {
      SIZ(w) = 0;
      return;
    }

  limb_cnt = cnt / GMP_NUMB_BITS;
  cnt %= GMP_NUMB_BITS;
  abs_usize = ABS (usize);

  /* MPZ_REALLOC(w) below is only when w!=u, so we can fetch PTR(u) here
     nice and early */
  up = PTR(u);

  if ((usize ^ dir) < 0)
    {
      /* Round towards zero, means just truncate */

      if (w == u)
        {
          /* if already smaller than limb_cnt then do nothing */
          if (abs_usize <= limb_cnt)
            return;
          wp = PTR(w);
        }
      else
        {
          i = MIN (abs_usize, limb_cnt+1);
          MPZ_REALLOC (w, i);
          wp = PTR(w);
          MPN_COPY (wp, up, i);

          /* if smaller than limb_cnt then only the copy is needed */
          if (abs_usize <= limb_cnt)
            {
              SIZ(w) = usize;
              return;
            }
        }
    }
  else
    {
      /* Round away from zero, means twos complement if non-zero */

      /* if u!=0 and smaller than divisor, then must negate */
      if (abs_usize <= limb_cnt)
        goto negate;

      /* if non-zero low limb, then must negate */
      for (i = 0; i < limb_cnt; i++)
        if (up[i] != 0)
          goto negate;

      /* if non-zero partial limb, then must negate */
      if ((up[limb_cnt] & LOW_MASK (cnt)) != 0)
        goto negate;

      /* otherwise low bits of u are zero, so that's the result */
      SIZ(w) = 0;
      return;

    negate:
      /* twos complement negation to get 2**cnt-u */

      MPZ_REALLOC (w, limb_cnt+1);
      up = PTR(u);
      wp = PTR(w);

      /* Ones complement */
      i = MIN (abs_usize, limb_cnt+1);
      mpn_com_n (wp, up, i);
      for ( ; i <= limb_cnt; i++)
        wp[i] = GMP_NUMB_MAX;

      /* Twos complement.  Since u!=0 in the relevant part, the twos
         complement never gives 0 and a carry, so can use MPN_INCR_U. */
      MPN_INCR_U (wp, limb_cnt+1, CNST_LIMB(1));

      usize = -usize;
    }

  /* Mask the high limb */
  high = wp[limb_cnt];
  high &= LOW_MASK (cnt);
  wp[limb_cnt] = high;

  /* Strip any resulting high zero limbs */
  while (high == 0)
    {
      limb_cnt--;
      if (limb_cnt < 0)
        {
          SIZ(w) = 0;
          return;
        }
      high = wp[limb_cnt];
    }

  limb_cnt++;
  SIZ(w) = (usize >= 0 ? limb_cnt : -limb_cnt);
}
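This helper computes the remainder modulo 2^cnt with a rounding direction; the documented mpz entry points for the floor and ceiling variants show the two behaviours described in the comments. A short usage sketch for a negative operand, where the floor variant takes the negate (two's-complement) path and the ceiling variant the truncate path:

#include <assert.h>
#include <gmp.h>

int
main (void)
{
  mpz_t u, r;

  mpz_init_set_si (u, -21);
  mpz_init (r);

  mpz_fdiv_r_2exp (r, u, 3);        /* -21 = -3*8 + 3  ->  r = 3  */
  assert (mpz_cmp_si (r, 3) == 0);

  mpz_cdiv_r_2exp (r, u, 3);        /* -21 = -2*8 - 5  ->  r = -5 */
  assert (mpz_cmp_si (r, -5) == 0);

  mpz_clear (u);
  mpz_clear (r);
  return 0;
}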
Example #6
/* Computes {rp,MIN(rn,an+bn)} <- {ap,an}*{bp,bn} Mod(B^rn-1)
 *
 * The result is expected to be ZERO if and only if one of the operands
 * already is. Otherwise the class [0] Mod(B^rn-1) is represented by
 * B^rn-1. This should not be a problem if mulmod_bnm1 is used to
 * combine results and obtain a natural number when one knows in
 * advance that the final value is less than (B^rn-1).
 * Moreover it should not be a problem if mulmod_bnm1 is used to
 * compute the full product with an+bn <= rn, because this condition
 * implies (B^an-1)(B^bn-1) < (B^rn-1) .
 *
 * Requires 0 < bn <= an <= rn and an + bn > rn/2
 * Scratch need: rn + (need for recursive call OR rn + 4). This gives
 *
 * S(n) <= rn + MAX (rn + 4, S(n/2)) <= 2rn + 4
 */
void
mpn_mulmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr tp)
{
  ASSERT (0 < bn);
  ASSERT (bn <= an);
  ASSERT (an <= rn);

  if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, MULMOD_BNM1_THRESHOLD))
    {
      if (UNLIKELY (bn < rn))
	{
	  if (UNLIKELY (an + bn <= rn))
	    {
	      mpn_mul (rp, ap, an, bp, bn);
	    }
	  else
	    {
	      mp_limb_t cy;
	      mpn_mul (tp, ap, an, bp, bn);
	      cy = mpn_add (rp, tp, rn, tp + rn, an + bn - rn);
	      MPN_INCR_U (rp, rn, cy);
	    }
	}
      else
	mpn_bc_mulmod_bnm1 (rp, ap, bp, rn, tp);
    }
  else
    {
      mp_size_t n;
      mp_limb_t cy;
      mp_limb_t hi;

      n = rn >> 1;

      /* We need at least an + bn >= n, to be able to fit one of the
	 recursive products at rp. Requiring strict inequality makes
	 the code slightly simpler. If desired, we could avoid this
	 restriction by initially halving rn as long as rn is even and
	 an + bn <= rn/2. */

      ASSERT (an + bn > n);

      /* Compute xm = a*b mod (B^n - 1), xp = a*b mod (B^n + 1)
	 and crt together as

	 x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)]
      */

#define a0 ap
#define a1 (ap + n)
#define b0 bp
#define b1 (bp + n)

#define xp  tp	/* 2n + 2 */
      /* am1  maybe in {xp, n} */
      /* bm1  maybe in {xp + n, n} */
#define sp1 (tp + 2*n + 2)
      /* ap1  maybe in {sp1, n + 1} */
      /* bp1  maybe in {sp1 + n + 1, n + 1} */

      {
	mp_srcptr am1, bm1;
	mp_size_t anm, bnm;
	mp_ptr so;

	bm1 = b0;
	bnm = bn;
	if (LIKELY (an > n))
	  {
	    am1 = xp;
	    cy = mpn_add (xp, a0, n, a1, an - n);
	    MPN_INCR_U (xp, n, cy);
	    anm = n;
	    so = xp + n;
	    if (LIKELY (bn > n))
	      {
		bm1 = so;
		cy = mpn_add (so, b0, n, b1, bn - n);
		MPN_INCR_U (so, n, cy);
		bnm = n;
		so += n;
	      }
	  }
	else
	  {
	    so = xp;
	    am1 = a0;
	    anm = an;
	  }

	mpn_mulmod_bnm1 (rp, n, am1, anm, bm1, bnm, so);
      }

      {
	int       k;
	mp_srcptr ap1, bp1;
	mp_size_t anp, bnp;

	bp1 = b0;
	bnp = bn;
	if (LIKELY (an > n)) {
	  ap1 = sp1;
	  cy = mpn_sub (sp1, a0, n, a1, an - n);
	  sp1[n] = 0;
	  MPN_INCR_U (sp1, n + 1, cy);
	  anp = n + ap1[n];
	  if (LIKELY (bn > n)) {
	    bp1 = sp1 + n + 1;
	    cy = mpn_sub (sp1 + n + 1, b0, n, b1, bn - n);
	    sp1[2*n+1] = 0;
	    MPN_INCR_U (sp1 + n + 1, n + 1, cy);
	    bnp = n + bp1[n];
	  }
	} else {
	  ap1 = a0;
	  anp = an;
	}

	if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD))
	  k=0;
	else
	  {
	    int mask;
	    k = mpn_fft_best_k (n, 0);
	    mask = (1<<k) - 1;
	    while (n & mask) {k--; mask >>=1;};
	  }
	if (k >= FFT_FIRST_K)
	  xp[n] = mpn_mul_fft (xp, n, ap1, anp, bp1, bnp, k);
	else if (UNLIKELY (bp1 == b0))
	  {
	    ASSERT (anp + bnp <= 2*n+1);
	    ASSERT (anp + bnp > n);
	    ASSERT (anp >= bnp);
	    mpn_mul (xp, ap1, anp, bp1, bnp);
	    anp = anp + bnp - n;
	    ASSERT (anp <= n || xp[2*n]==0);
	    anp-= anp > n;
	    cy = mpn_sub (xp, xp, n, xp + n, anp);
	    xp[n] = 0;
	    MPN_INCR_U (xp, n+1, cy);
	  }
	else
	  mpn_bc_mulmod_bnp1 (xp, ap1, bp1, n, xp);
      }

      /* Here the CRT recomposition begins.

	 xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1)
	 Division by 2 is a bitwise rotation.

	 Assumes xp normalised mod (B^n+1).

	 The residue class [0] is represented by [B^n-1]; except when
	 both input are ZERO.
      */

#if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc
#if HAVE_NATIVE_mpn_rsh1add_nc
      cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */
      hi = cy << (GMP_NUMB_BITS - 1);
      cy = 0;
      /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi
	 overflows, i.e. a further increment will not overflow again. */
#else /* ! _nc */
      cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */
      hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
      cy >>= 1;
      /* cy = 1 only if xp[n] = 1 i.e. {xp,n} = ZERO, this implies that
	 the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */
#endif
#if GMP_NAIL_BITS == 0
      add_ssaaaa(cy, rp[n-1], cy, rp[n-1], 0, hi);
#else
      cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS-1);
      rp[n-1] ^= hi;
#endif
#else /* ! HAVE_NATIVE_mpn_rsh1add_n */
#if HAVE_NATIVE_mpn_add_nc
      cy = mpn_add_nc(rp, rp, xp, n, xp[n]);
#else /* ! _nc */
      cy = xp[n] + mpn_add_n(rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */
#endif
      cy += (rp[0]&1);
      mpn_rshift(rp, rp, n, 1);
      ASSERT (cy <= 2);
      hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
      cy >>= 1;
      /* We can have cy != 0 only if hi = 0... */
      ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0);
      rp[n-1] |= hi;
      /* ... rp[n-1] + cy can not overflow, the following INCR is correct. */
#endif
      ASSERT (cy <= 1);
      /* Next increment can not overflow, read the previous comments about cy. */
      ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0));
      MPN_INCR_U(rp, n, cy);

      /* Compute the highest half:
	 ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n
       */
      if (UNLIKELY (an + bn < rn))
	{
	  /* Note that in this case, the only way the result can equal
	     zero mod B^{rn} - 1 is if one of the inputs is zero, and
	     then the output of both the recursive calls and this CRT
	     reconstruction is zero, not B^{rn} - 1. Which is good,
	     since the latter representation doesn't fit in the output
	     area.*/
	  cy = mpn_sub_n (rp + n, rp, xp, an + bn - n);

	  /* FIXME: This subtraction of the high parts is not really
	     necessary, we do it to get the carry out, and for sanity
	     checking. */
	  cy = xp[n] + mpn_sub_nc (xp + an + bn - n, rp + an + bn - n,
				   xp + an + bn - n, rn - (an + bn), cy);
	  ASSERT (an + bn == rn - 1 ||
		  mpn_zero_p (xp + an + bn - n + 1, rn - 1 - (an + bn)));
	  cy = mpn_sub_1 (rp, rp, an + bn, cy);
	  ASSERT (cy == (xp + an + bn - n)[0]);
	}
      else
	{
	  cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
	  /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
	     DECR will affect _at most_ the lowest n limbs. */
	  MPN_DECR_U (rp, 2*n, cy);
	}
#undef a0
#undef a1
#undef b0
#undef b1
#undef xp
#undef sp1
    }
}
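The CRT recomposition used above, x = -xp*B^n + (B^n + 1)*[(xp + xm)/2 mod (B^n - 1)], can be checked with machine integers. A sketch with B^n = 2^16, so the two moduli are 65535 and 65537 and the combined modulus is 2^32 - 1; division by 2 modulo the odd B^n - 1 is multiplication by (B^n)/2 (all names local to the sketch):

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  const uint64_t Bn = 1u << 16, m = Bn - 1, p = Bn + 1, N = Bn * Bn - 1;
  uint64_t a = 0x89abcdefULL, b = 0x01234567ULL;   /* both < N */
  uint64_t ab = a * b;                             /* exact, fits in 64 bits */

  uint64_t xm = ab % m, xp = ab % p;
  uint64_t s = ((xp + xm) * (Bn / 2)) % m;         /* (xp + xm)/2 mod (B^n - 1) */
  int64_t  x = (int64_t) (s * p) - (int64_t) (xp * Bn);

  x %= (int64_t) N;                                /* canonical residue mod B^2n - 1 */
  if (x < 0)
    x += (int64_t) N;

  assert ((uint64_t) x == ab % N);
  return 0;
}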
void
mpn_toom_interpolate_7pts (mp_ptr rp, mp_size_t n, enum toom7_flags flags,
			   mp_ptr w1, mp_ptr w3, mp_ptr w4, mp_ptr w5,
			   mp_size_t w6n, mp_ptr tp)
{
  mp_size_t m;
  mp_limb_t cy;

  m = 2*n + 1;
#define w0 rp
#define w2 (rp + 2*n)
#define w6 (rp + 6*n)

  ASSERT (w6n > 0);
  ASSERT (w6n <= 2*n);

  /* Using formulas similar to Marco Bodrato's

     W5 = W5 + W4
     W1 =(W4 - W1)/2
     W4 = W4 - W0
     W4 =(W4 - W1)/4 - W6*16
     W3 =(W2 - W3)/2
     W2 = W2 - W3

     W5 = W5 - W2*65      May be negative.
     W2 = W2 - W6 - W0
     W5 =(W5 + W2*45)/2   Now >= 0 again.
     W4 =(W4 - W2)/3
     W2 = W2 - W4

     W1 = W5 - W1         May be negative.
     W5 =(W5 - W3*8)/9
     W3 = W3 - W5
     W1 =(W1/15 + W5)/2   Now >= 0 again.
     W5 = W5 - W1

     where W0 = f(0), W1 = f(-2), W2 = f(1), W3 = f(-1),
	   W4 = f(2), W5 = f(1/2), W6 = f(oo),

     Note that most intermediate results are positive; the ones that
     may be negative are represented in two's complement. We must
     never shift right a value that may be negative, since that would
     invalidate the sign bit. On the other hand, divexact by odd
     numbers works fine with two's complement.
  */

  mpn_add_n (w5, w5, w4, m);
  if (flags & toom7_w1_neg)
    {
#ifdef HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (w1, w1, w4, m);
#else
      mpn_add_n (w1, w1, w4, m);  ASSERT (!(w1[0] & 1));
      mpn_rshift (w1, w1, m, 1);
#endif
    }
  else
    {
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (w1, w4, w1, m);
#else
      mpn_sub_n (w1, w4, w1, m);  ASSERT (!(w1[0] & 1));
      mpn_rshift (w1, w1, m, 1);
#endif
    }
  mpn_sub (w4, w4, m, w0, 2*n);
  mpn_sub_n (w4, w4, w1, m);  ASSERT (!(w4[0] & 3));
  mpn_rshift (w4, w4, m, 2); /* w4>=0 */

  tp[w6n] = mpn_lshift (tp, w6, w6n, 4);
  mpn_sub (w4, w4, m, tp, w6n+1);

  if (flags & toom7_w3_neg)
    {
#ifdef HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (w3, w3, w2, m);
#else
      mpn_add_n (w3, w3, w2, m);  ASSERT (!(w3[0] & 1));
      mpn_rshift (w3, w3, m, 1);
#endif
    }
  else
    {
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (w3, w2, w3, m);
#else
      mpn_sub_n (w3, w2, w3, m);  ASSERT (!(w3[0] & 1));
      mpn_rshift (w3, w3, m, 1);
#endif
    }

  mpn_sub_n (w2, w2, w3, m);

  mpn_submul_1 (w5, w2, m, 65);
  mpn_sub (w2, w2, m, w6, w6n);
  mpn_sub (w2, w2, m, w0, 2*n);

  mpn_addmul_1 (w5, w2, m, 45);  ASSERT (!(w5[0] & 1));
  mpn_rshift (w5, w5, m, 1);
  mpn_sub_n (w4, w4, w2, m);

  mpn_divexact_by3 (w4, w4, m);
  mpn_sub_n (w2, w2, w4, m);

  mpn_sub_n (w1, w5, w1, m);
  mpn_lshift (tp, w3, m, 3);
  mpn_sub_n (w5, w5, tp, m);
  mpn_divexact_by9 (w5, w5, m);
  mpn_sub_n (w3, w3, w5, m);

  mpn_divexact_by15 (w1, w1, m);
  mpn_add_n (w1, w1, w5, m);  ASSERT (!(w1[0] & 1));
  mpn_rshift (w1, w1, m, 1); /* w1>=0 now */
  mpn_sub_n (w5, w5, w1, m);

  /* These bounds are valid for the 4x4 polynomial product of toom44,
   * and they are conservative for toom53 and toom62. */
  ASSERT (w1[2*n] < 2);
  ASSERT (w2[2*n] < 3);
  ASSERT (w3[2*n] < 4);
  ASSERT (w4[2*n] < 3);
  ASSERT (w5[2*n] < 2);

  /* Addition chain. Note carries and the 2n'th limbs that need to be
   * added in.
   *
   * Special care is needed for w2[2n] and the corresponding carry,
   * since the "simple" way of adding it all together would overwrite
   * the limb at wp[2*n] and rp[4*n] (same location) with the sum of
   * the high half of w3 and the low half of w4.
   *
   *         7    6    5    4    3    2    1    0
   *    |    |    |    |    |    |    |    |    |
   *                  ||w3 (2n+1)|
   *             ||w4 (2n+1)|
   *        ||w5 (2n+1)|        ||w1 (2n+1)|
   *  + | w6 (w6n)|        ||w2 (2n+1)| w0 (2n) |  (share storage with r)
   *  -----------------------------------------------
   *  r |    |    |    |    |    |    |    |    |
   *        c7   c6   c5   c4   c3                 Carries to propagate
   */

  cy = mpn_add_n (rp + n, rp + n, w1, m);
  MPN_INCR_U (w2 + n + 1, n , cy);
  cy = mpn_add_n (rp + 3*n, rp + 3*n, w3, n);
  MPN_INCR_U (w3 + n, n + 1, w2[2*n] + cy);
  cy = mpn_add_n (rp + 4*n, w3 + n, w4, n);
  MPN_INCR_U (w4 + n, n + 1, w3[2*n] + cy);
  cy = mpn_add_n (rp + 5*n, w4 + n, w5, n);
  MPN_INCR_U (w5 + n, n + 1, w4[2*n] + cy);
  if (w6n > n + 1)
    ASSERT_NOCARRY (mpn_add (rp + 6*n, rp + 6*n, w6n, w5 + n, n + 1));
  else
    {
      ASSERT_NOCARRY (mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, w6n));
#if WANT_ASSERT
      {
	mp_size_t i;
	for (i = w6n; i <= n; i++)
	  ASSERT (w5[n + i] == 0);
      }
#endif
    }
}
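The remark above that divexact by odd numbers works on two's-complement values is the usual modular-inverse argument: multiplying by the divisor's inverse modulo 2^w gives the exact quotient even when the operand is a negative value represented mod 2^w. A one-word illustration (the constant is the inverse of 3 modulo 2^64):

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  const uint64_t inv3 = 0xaaaaaaaaaaaaaaabULL;  /* 3 * inv3 == 1 (mod 2^64) */
  uint64_t pos = 21, neg = (uint64_t) -21;      /* -21 in two's complement  */

  assert (3 * inv3 == 1);
  assert (pos * inv3 == 7);                     /* exact quotient  7 */
  assert (neg * inv3 == (uint64_t) -7);         /* exact quotient -7, sign preserved */
  return 0;
}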
void
mpn_toom44_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn,
		mp_ptr scratch)
{
  mp_size_t n, s, t;
  mp_limb_t cy;
  enum toom7_flags flags;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)
#define b3  (bp + 3*n)

  ASSERT (an >= bn);

  n = (an + 3) >> 2;

  s = an - 3 * n;
  t = bn - 3 * n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);
  ASSERT (s >= t);

  /* NOTE: The multiplications to v2, vm2, vh and vm1 overwrite the
   * following limb, so these must be computed in order, and we need a
   * one limb gap to tp. */
#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 6 * n)			/* s+t */
#define v2    scratch				/* 2n+1 */
#define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
#define vh    (scratch + 4 * n + 2)		/* 2n+1 */
#define vm1   (scratch + 6 * n + 3)		/* 2n+1 */
#define tp (scratch + 8*n + 5)

  /* apx and bpx must not overlap with v1 */
#define apx   pp				/* n+1 */
#define amx   (pp + n + 1)			/* n+1 */
#define bmx   (pp + 2*n + 2)			/* n+1 */
#define bpx   (pp + 4*n + 2)			/* n+1 */

  /* Total scratch need: 8*n + 5 + scratch for recursive calls. This
     gives roughly 32 n/3 + log term. */

  /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 and amx = a0 - 2 a1 + 4 a2 - 8 a3.  */
  flags = (enum toom7_flags) (toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (apx, amx, ap, n, s, tp));

  /* Compute bpx = b0 + 2 b1 + 4 b2 + 8 b3 and bmx = b0 - 2 b1 + 4 b2 - 8 b3.  */
  flags = (enum toom7_flags) (flags ^ toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (bpx, bmx, bp, n, t, tp));

  TOOM44_MUL_N_REC (v2, apx, bpx, n + 1, tp);	/* v2,  2n+1 limbs */
  TOOM44_MUL_N_REC (vm2, amx, bmx, n + 1, tp);	/* vm2,  2n+1 limbs */

  /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 = ((2*a0 + a1) * 2 + a2) * 2 + a3 */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (apx, a1, a0, n);
  cy = 2*cy + mpn_addlsh1_n (apx, a2, apx, n);
  if (s < n)
    {
      mp_limb_t cy2;
      cy2 = mpn_addlsh1_n (apx, a3, apx, s);
      apx[n] = 2*cy + mpn_lshift (apx + s, apx + s, n - s, 1);
      MPN_INCR_U (apx + s, n+1-s, cy2);
    }
  else
    apx[n] = 2*cy + mpn_addlsh1_n (apx, a3, apx, n);
#else
  cy = mpn_lshift (apx, a0, n, 1);
  cy += mpn_add_n (apx, apx, a1, n);
  cy = 2*cy + mpn_lshift (apx, apx, n, 1);
  cy += mpn_add_n (apx, apx, a2, n);
  cy = 2*cy + mpn_lshift (apx, apx, n, 1);
  apx[n] = cy + mpn_add (apx, apx, n, a3, s);
#endif

  /* Compute bpx = 8 b0 + 4 b1 + 2 b2 + b3 = ((2*b0 + b1) * 2 + b2) * 2 + b3 */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (bpx, b1, b0, n);
  cy = 2*cy + mpn_addlsh1_n (bpx, b2, bpx, n);
  if (t < n)
    {
      mp_limb_t cy2;
      cy2 = mpn_addlsh1_n (bpx, b3, bpx, t);
      bpx[n] = 2*cy + mpn_lshift (bpx + t, bpx + t, n - t, 1);
      MPN_INCR_U (bpx + t, n+1-t, cy2);
    }
  else
    bpx[n] = 2*cy + mpn_addlsh1_n (bpx, b3, bpx, n);
#else
  cy = mpn_lshift (bpx, b0, n, 1);
  cy += mpn_add_n (bpx, bpx, b1, n);
  cy = 2*cy + mpn_lshift (bpx, bpx, n, 1);
  cy += mpn_add_n (bpx, bpx, b2, n);
  cy = 2*cy + mpn_lshift (bpx, bpx, n, 1);
  bpx[n] = cy + mpn_add (bpx, bpx, n, b3, t);
#endif

  ASSERT (apx[n] < 15);
  ASSERT (bpx[n] < 15);

  TOOM44_MUL_N_REC (vh, apx, bpx, n + 1, tp);	/* vh,  2n+1 limbs */

  /* Compute apx = a0 + a1 + a2 + a3 and amx = a0 - a1 + a2 - a3.  */
  flags = (enum toom7_flags) (flags | toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (apx, amx, ap, n, s, tp));

  /* Compute bpx = b0 + b1 + b2 + b3 and bmx = b0 - b1 + b2 - b3.  */
  flags = (enum toom7_flags) (flags ^ toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (bpx, bmx, bp, n, t, tp));

  TOOM44_MUL_N_REC (vm1, amx, bmx, n + 1, tp);	/* vm1,  2n+1 limbs */
  /* Clobbers amx, bmx. */
  TOOM44_MUL_N_REC (v1, apx, bpx, n + 1, tp);	/* v1,  2n+1 limbs */

  TOOM44_MUL_N_REC (v0, a0, b0, n, tp);
  if (s > t)
    mpn_mul (vinf, a3, s, b3, t);
  else
    TOOM44_MUL_N_REC (vinf, a3, b3, s, tp);	/* vinf, s+t limbs */

  mpn_toom_interpolate_7pts (pp, n, flags, vm2, vm1, v2, vh, s + t, tp);
}
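The two evaluation comments above use opposite coefficient orders: a0 + 2 a1 + 4 a2 + 8 a3 is f(2), while the Horner chain ((2 a0 + a1)*2 + a2)*2 + a3 is the scaled point 8*f(1/2), for f(x) = a0 + a1 x + a2 x^2 + a3 x^3. A trivial single-word check of both:

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  uint64_t a0 = 11, a1 = 7, a2 = 5, a3 = 3;

  uint64_t f2  = a0 + 2*a1 + 4*a2 + 8*a3;        /* f(2)     */
  uint64_t fh8 = ((2*a0 + a1)*2 + a2)*2 + a3;    /* 8*f(1/2) */

  assert (f2  == 11 + 2*7 + 4*5 + 8*3);
  assert (fh8 == 8*11 + 4*7 + 2*5 + 3);
  return 0;
}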
Example #9
void
mpn_toom53_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn,
		mp_ptr scratch)
{
  mp_size_t n, s, t;
  mp_limb_t cy;
  mp_ptr gp;
  mp_ptr as1, asm1, as2, asm2, ash;
  mp_ptr bs1, bsm1, bs2, bsm2, bsh;
  mp_ptr tmp;
  enum toom7_flags flags;
  TMP_DECL;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)
#define a4  (ap + 4*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)

  n = 1 + (3 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) / (size_t) 3);

  s = an - 4 * n;
  t = bn - 2 * n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  TMP_MARK;

  tmp = TMP_ALLOC_LIMBS (10 * (n + 1));
  as1  = tmp; tmp += n + 1;
  asm1 = tmp; tmp += n + 1;
  as2  = tmp; tmp += n + 1;
  asm2 = tmp; tmp += n + 1;
  ash  = tmp; tmp += n + 1;
  bs1  = tmp; tmp += n + 1;
  bsm1 = tmp; tmp += n + 1;
  bs2  = tmp; tmp += n + 1;
  bsm2 = tmp; tmp += n + 1;
  bsh  = tmp; tmp += n + 1;

  gp = pp;

  /* Compute as1 and asm1.  */
  flags = (enum toom7_flags) (toom7_w3_neg & mpn_toom_eval_pm1 (as1, asm1, 4, ap, n, s, gp));

  /* Compute as2 and asm2. */
  flags = (enum toom7_flags) (flags | toom7_w1_neg & mpn_toom_eval_pm2 (as2, asm2, 4, ap, n, s, gp));

  /* Compute ash = 16 a0 + 8 a1 + 4 a2 + 2 a3 + a4
     = 2*(2*(2*(2*a0 + a1) + a2) + a3) + a4  */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (ash, a1, a0, n);
  cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n);
  cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n);
  if (s < n)
    {
      mp_limb_t cy2;
      cy2 = mpn_addlsh1_n (ash, a4, ash, s);
      ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1);
      MPN_INCR_U (ash + s, n+1-s, cy2);
    }
  else
    ash[n] = 2*cy + mpn_addlsh1_n (ash, a4, ash, n);
#else
  cy = mpn_lshift (ash, a0, n, 1);
  cy += mpn_add_n (ash, ash, a1, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  cy += mpn_add_n (ash, ash, a2, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  cy += mpn_add_n (ash, ash, a3, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  ash[n] = cy + mpn_add (ash, ash, n, a4, s);
#endif

  /* Compute bs1 and bsm1.  */
  bs1[n] = mpn_add (bs1, b0, n, b2, t);		/* b0 + b2 */
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0)
    {
      bs1[n] = mpn_add_n_sub_n (bs1, bsm1, b1, bs1, n) >> 1;
      bsm1[n] = 0;
      flags = (enum toom7_flags) (flags ^ toom7_w3_neg);
    }
Example #10
void
check_incr_data (void)
{
  static const struct {
    mp_limb_t        n;
    const mp_limb_t  src[SIZE];
    const mp_limb_t  want[SIZE];
  } data[] = {
    { 1, { 0 },   { 1 } },
    { 1, { 123 }, { 124 } },
    { 2, { 0 },   { 2 } },
    { 2, { 123 }, { 125 } },
    { M, { 0 },   { M } },

    { 1, { M, 0 },   { 0,   1 } },
    { 1, { M, 123 }, { 0,   124 } },
    { 2, { M, 0 },   { 1,   1 } },
    { 2, { M, 123 }, { 1,   124 } },
    { M, { M, 0 },   { M-1, 1 } },
    { M, { M, 123 }, { M-1, 124 } },

    { 1, { M, M, 0 },   { 0,   0, 1 } },
    { 1, { M, M, 123 }, { 0,   0, 124 } },
    { 2, { M, M, 0 },   { 1,   0, 1 } },
    { 2, { M, M, 123 }, { 1,   0, 124 } },
    { M, { M, M, 0 },   { M-1, 0, 1 } },
    { M, { M, M, 123 }, { M-1, 0, 124 } },

    { 1, { M, M, M, 0 },   { 0,   0, 0, 1 } },
    { 1, { M, M, M, 123 }, { 0,   0, 0, 124 } },
    { 2, { M, M, M, 0 },   { 1,   0, 0, 1 } },
    { 2, { M, M, M, 123 }, { 1,   0, 0, 124 } },
    { M, { M, M, M, 0 },   { M-1, 0, 0, 1 } },
    { M, { M, M, M, 123 }, { M-1, 0, 0, 124 } },

    { 1, { M, M, M, M, 0 },   { 0,   0, 0, 0, 1 } },
    { 1, { M, M, M, M, 123 }, { 0,   0, 0, 0, 124 } },
    { 2, { M, M, M, M, 0 },   { 1,   0, 0, 0, 1 } },
    { 2, { M, M, M, M, 123 }, { 1,   0, 0, 0, 124 } },
    { M, { M, M, M, M, 0 },   { M-1, 0, 0, 0, 1 } },
    { M, { M, M, M, M, 123 }, { M-1, 0, 0, 0, 124
#if defined (__hpux) && ! defined (__GNUC__)
    /* Some versions (at least HP92453-01 B.11.11.23709.GP) of the
       HP C compilers fail to zero-fill aggregates as the ISO C standard
       requires (cf 6.5.7 Initialization).  Compensate here:  */
				, 0, 0, 0, 0, 0
#endif
    } }
  };

  mp_limb_t  got[SIZE];
  int   i;

  for (i = 0; i < numberof (data); i++)
    {
      refmpn_copyi (got, data[i].src, SIZE);
      MPN_INCR_U (got, SIZE, data[i].n);
      check_one ("check_incr (general)", i,
                 data[i].src, data[i].n,
                 got, data[i].want, SIZE);

      if (data[i].n == 1)
        {
          refmpn_copyi (got, data[i].src, SIZE);
          MPN_INCR_U (got, SIZE, CNST_LIMB(1));
          check_one ("check_incr (const 1)", i,
                     data[i].src, data[i].n,
                     got, data[i].want, SIZE);
        }
    }
}
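The table above encodes the expected behaviour: the single-limb value is added to the lowest limb and any carry ripples upward. A reference loop matching those rows (plain C with 64-bit limbs; the name is purely illustrative, not a GMP function):

#include <assert.h>
#include <stdint.h>

static void
ref_incr_u (uint64_t *p, uint64_t incr)
{
  uint64_t old = p[0];
  p[0] += incr;
  if (p[0] < old)                 /* carry out of the low limb */
    {
      int i = 1;
      while (++p[i] == 0)         /* propagate; the test data never overflows the top */
        i++;
    }
}

int
main (void)
{
  /* the { 2, { M, M, 123 }, { 1, 0, 124 } } row from the table above */
  uint64_t p[3] = { UINT64_MAX, UINT64_MAX, 123 };
  ref_incr_u (p, 2);
  assert (p[0] == 1 && p[1] == 0 && p[2] == 124);
  return 0;
}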
Example #11
void tc4_addmul_1(mp_ptr wp, mp_size_t * wn, mp_srcptr xp, mp_size_t xn, mp_limb_t y)
{
  mp_size_t  sign, wu, xu, ws, new_wn, min_size, dsize;
  mp_limb_t  cy;

  /* w unaffected if x==0 or y==0 */
  if (xn == 0 || y == 0)
    return;

  sign = xn;
  xu = ABS (xn);

  ws = *wn;
  if (*wn == 0)
  {
      /* nothing to add to, just set x*y, "sign" gives the sign */
      cy = mpn_mul_1 (wp, xp, xu, y);
      if (cy)
		{
			wp[xu] = cy;
         xu = xu + 1;
		} 
      *wn = (sign >= 0 ? xu : -xu);
      return;
  }
  
  sign ^= *wn;
  wu = ABS (*wn);

  new_wn = MAX (wu, xu);
  min_size = MIN (wu, xu);

  if (sign >= 0)
  {
      /* addmul of absolute values */

      cy = mpn_addmul_1 (wp, xp, min_size, y);
      
      dsize = xu - wu;
#if HAVE_NATIVE_mpn_mul_1c
      if (dsize > 0)
        cy = mpn_mul_1c (wp + min_size, xp + min_size, dsize, y, cy);
      else if (dsize < 0)
      {
          dsize = -dsize;
          cy = mpn_add_1 (wp + min_size, wp + min_size, dsize, cy);
      }
#else
      if (dsize != 0)
      {
          mp_limb_t cy2;
          if (dsize > 0)
            cy2 = mpn_mul_1 (wp + min_size, xp + min_size, dsize, y);
          else
          {
              dsize = -dsize;
              cy2 = 0;
          }
          cy = cy2 + mpn_add_1 (wp + min_size, wp + min_size, dsize, cy);
      }
#endif

      if (cy)
		{
			wp[dsize + min_size] = cy;
         new_wn ++;
		}
   } else
   {
      /* submul of absolute values */

      cy = mpn_submul_1 (wp, xp, min_size, y);
      if (wu >= xu)
      {
          /* if w bigger than x, then propagate borrow through it */
          if (wu != xu)
            cy = mpn_sub_1 (wp + xu, wp + xu, wu - xu, cy);

          if (cy != 0)
          {
              /* Borrow out of w, take twos complement negative to get
                 absolute value, flip sign of w.  */
              wp[new_wn] = ~-cy;  /* extra limb is 0-cy */
              mpn_not (wp, new_wn);
              new_wn++;
              MPN_INCR_U (wp, new_wn, CNST_LIMB(1));
              ws = -*wn;
          }
      } else /* wu < xu */
      {
          /* x bigger than w, so want x*y-w.  Submul has given w-x*y, so
             take twos complement and use an mpn_mul_1 for the rest.  */

          mp_limb_t  cy2;

          /* -(-cy*b^n + w-x*y) = (cy-1)*b^n + ~(w-x*y) + 1 */
          mpn_not (wp, wu);
          cy += mpn_add_1 (wp, wp, wu, CNST_LIMB(1));
          cy -= 1;

          /* If cy-1 == -1 then hold that -1 for later.  mpn_submul_1 never
             returns cy==MP_LIMB_T_MAX so that value always indicates a -1. */
          cy2 = (cy == MP_LIMB_T_MAX);
          cy += cy2;
          MPN_MUL_1C (cy, wp + wu, xp + wu, xu - wu, y, cy);
          wp[new_wn] = cy;
          new_wn += (cy != 0);

          /* Apply any -1 from above.  The value at wp+wsize is non-zero
             because y!=0 and the high limb of x will be non-zero.  */
          if (cy2)
            MPN_DECR_U (wp+wu, new_wn - wu, CNST_LIMB(1));

          ws = -*wn;
        }

      /* submul can produce high zero limbs due to cancellation, both when w
         has more limbs or x has more  */
      MPN_NORMALIZE (wp, new_wn);
  }

  *wn = (ws >= 0 ? new_wn : -new_wn);

  ASSERT (new_wn == 0 || wp[new_wn - 1] != 0);
}
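The identity used in the wu < xu branch above, -(-cy*b^n + t) = (cy-1)*b^n + ~t + 1 for the truncated difference t = w - x*y with borrow cy, can be spot-checked on a single 64-bit limb:

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  uint64_t w = 5, xy = 12;
  uint64_t t = w - xy;            /* truncated w - x*y, i.e. 2^64 - 7 */
  uint64_t cy = (w < xy);         /* borrow out of the subtraction    */

  /* true value is -cy*2^64 + t = -7; with cy == 1 the (cy-1) term
     vanishes and the absolute value is ~t + 1 */
  assert (cy == 1);
  assert (~t + 1 == 7);
  return 0;
}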
/* Evaluates a polynomial of degree k > 2, in the points +2^shift and -2^shift. */
int
mpn_toom_eval_pm2exp (mp_ptr xp2, mp_ptr xm2, unsigned k,
		      mp_srcptr xp, mp_size_t n, mp_size_t hn, unsigned shift,
		      mp_ptr tp)
{
  unsigned i;
  int neg;
#ifdef HAVE_NATIVE_mpn_addlsh_n
  mp_limb_t cy;
#endif

  ASSERT (k >= 3);
  ASSERT (shift*k < GMP_NUMB_BITS);

  ASSERT (hn > 0);
  ASSERT (hn <= n);

  /* The degree k is also the number of full-size coefficients, so
   * that last coefficient, of size hn, starts at xp + k*n. */

#ifdef HAVE_NATIVE_mpn_addlsh_n
  xp2[n] = mpn_addlsh_n (xp2, xp, xp + 2*n, n, 2*shift);
  for (i = 4; i < k; i += 2)
    xp2[n] += mpn_addlsh_n (xp2, xp2, xp + i*n, n, i*shift);

  tp[n] = mpn_lshift (tp, xp+n, n, shift);
  for (i = 3; i < k; i+= 2)
    tp[n] += mpn_addlsh_n (tp, tp, xp+i*n, n, i*shift);

  if (k & 1)
    {
      cy = mpn_addlsh_n (tp, tp, xp+k*n, hn, k*shift);
      MPN_INCR_U (tp + hn, n+1 - hn, cy);
    }
  else
    {
      cy = mpn_addlsh_n (xp2, xp2, xp+k*n, hn, k*shift);
      MPN_INCR_U (xp2 + hn, n+1 - hn, cy);
    }

#else /* !HAVE_NATIVE_mpn_addlsh_n */
  xp2[n] = mpn_lshift (tp, xp+2*n, n, 2*shift);
  xp2[n] += mpn_add_n (xp2, xp, tp, n);
  for (i = 4; i < k; i += 2)
    {
      xp2[n] += mpn_lshift (tp, xp + ((mp_size_t) i)*n, n, i*shift);
      xp2[n] += mpn_add_n (xp2, xp2, tp, n);
    }

  tp[n] = mpn_lshift (tp, xp+n, n, shift);
  for (i = 3; i < k; i+= 2)
    {
      tp[n] += mpn_lshift (xm2, xp + ((mp_size_t) i)*n, n, i*shift);
      tp[n] += mpn_add_n (tp, tp, xm2, n);
    }

  xm2[hn] = mpn_lshift (xm2, xp + ((mp_size_t) k)*n, hn, k*shift);
  if (k & 1)
    mpn_add (tp, tp, n+1, xm2, hn+1);
  else
    mpn_add (xp2, xp2, n+1, xm2, hn+1);
#endif /* !HAVE_NATIVE_mpn_addlsh_n */

  neg = (mpn_cmp (xp2, tp, n + 1) < 0) ? ~0 : 0;

#ifdef HAVE_NATIVE_mpn_add_n_sub_n
  if (neg)
    mpn_add_n_sub_n (xp2, xm2, tp, xp2, n + 1);
  else
    mpn_add_n_sub_n (xp2, xm2, xp2, tp, n + 1);
#else /* !HAVE_NATIVE_mpn_add_n_sub_n */
  if (neg)
    mpn_sub_n (xm2, tp, xp2, n + 1);
  else
    mpn_sub_n (xm2, xp2, tp, n + 1);

  mpn_add_n (xp2, xp2, tp, n + 1);
#endif /* !HAVE_NATIVE_mpn_add_n_sub_n */

  /* FIXME: the following asserts are useless if (k+1)*shift >= GMP_LIMB_BITS */
  ASSERT ((k+1)*shift >= GMP_LIMB_BITS ||
	  xp2[n] < ((CNST_LIMB(1)<<((k+1)*shift))-1)/((CNST_LIMB(1)<<shift)-1));
  ASSERT ((k+2)*shift >= GMP_LIMB_BITS ||
	  xm2[n] < ((CNST_LIMB(1)<<((k+2)*shift))-((k&1)?(CNST_LIMB(1)<<shift):1))/((CNST_LIMB(1)<<(2*shift))-1));

  return neg;
}
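The routine accumulates the even-index and odd-index coefficients separately and then takes sum and difference, since f(+2^shift) = E + O and f(-2^shift) = E - O. A single-word sketch of that split for a degree-3 polynomial:

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  const unsigned shift = 3;                            /* evaluate at +/- 2^3 */
  int64_t c[4] = { 9, 7, 5, 3 };

  int64_t E = c[0] + (c[2] << (2*shift));              /* even-index terms */
  int64_t O = (c[1] << shift) + (c[3] << (3*shift));   /* odd-index terms  */

  int64_t y = 1 << shift;
  int64_t fpos = c[0] + c[1]*y + c[2]*y*y + c[3]*y*y*y;
  int64_t fneg = c[0] - c[1]*y + c[2]*y*y - c[3]*y*y*y;

  assert (E + O == fpos);
  assert (E - O == fneg);
  return 0;
}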
Example #13
void
mpn_toom22_mul (mp_ptr pp,
                mp_srcptr ap, mp_size_t an,
                mp_srcptr bp, mp_size_t bn,
                mp_ptr scratch)
{
    const int __gmpn_cpuvec_initialized = 1;
    mp_size_t n, s, t;
    int vm1_neg;
    mp_limb_t cy, cy2;
    mp_ptr asm1;
    mp_ptr bsm1;

#define a0  ap
#define a1  (ap + n)
#define b0  bp
#define b1  (bp + n)

    s = an >> 1;
    n = an - s;
    t = bn - n;

    ASSERT (an >= bn);

    ASSERT (0 < s && s <= n && s >= n - 1);
    ASSERT (0 < t && t <= s);

    asm1 = pp;
    bsm1 = pp + n;

    vm1_neg = 0;

    /* Compute asm1.  */
    if (s == n)
    {
        if (mpn_cmp (a0, a1, n) < 0)
        {
            mpn_sub_n (asm1, a1, a0, n);
            vm1_neg = 1;
        }
        else
        {
            mpn_sub_n (asm1, a0, a1, n);
        }
    }
    else /* n - s == 1 */
    {
        if (a0[s] == 0 && mpn_cmp (a0, a1, s) < 0)
        {
            mpn_sub_n (asm1, a1, a0, s);
            asm1[s] = 0;
            vm1_neg = 1;
        }
        else
        {
            asm1[s] = a0[s] - mpn_sub_n (asm1, a0, a1, s);
        }
    }

    /* Compute bsm1.  */
    if (t == n)
    {
        if (mpn_cmp (b0, b1, n) < 0)
        {
            mpn_sub_n (bsm1, b1, b0, n);
            vm1_neg ^= 1;
        }
        else
        {
            mpn_sub_n (bsm1, b0, b1, n);
        }
    }
    else
    {
        if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
        {
            mpn_sub_n (bsm1, b1, b0, t);
            MPN_ZERO (bsm1 + t, n - t);
            vm1_neg ^= 1;
        }
        else
        {
            mpn_sub (bsm1, b0, n, b1, t);
        }
    }

#define v0	pp				/* 2n */
#define vinf	(pp + 2 * n)			/* s+t */
#define vm1	scratch				/* 2n */
#define scratch_out	scratch + 2 * n

    /* vm1, 2n limbs */
    TOOM22_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);

    if (s > t)  TOOM22_MUL_REC (vinf, a1, s, b1, t, scratch_out);
    else        TOOM22_MUL_N_REC (vinf, a1, b1, s, scratch_out);

    /* v0, 2n limbs */
    TOOM22_MUL_N_REC (v0, ap, bp, n, scratch_out);

    /* H(v0) + L(vinf) */
    cy = mpn_add_n (pp + 2 * n, v0 + n, vinf, n);

    /* L(v0) + H(v0) */
    cy2 = cy + mpn_add_n (pp + n, pp + 2 * n, v0, n);

    /* L(vinf) + H(vinf) */
    cy += mpn_add (pp + 2 * n, pp + 2 * n, n, vinf + n, s + t - n);

    if (vm1_neg)
        cy += mpn_add_n (pp + n, pp + n, vm1, 2 * n);
    else
        cy -= mpn_sub_n (pp + n, pp + n, vm1, 2 * n);

    ASSERT (cy + 1  <= 3);
    ASSERT (cy2 <= 2);

    MPN_INCR_U (pp + 2 * n, s + t, cy2);
    if (LIKELY (cy <= 2))
        /* if s+t==n, cy is zero, but we should not access pp[3*n] at all. */
        MPN_INCR_U (pp + 3 * n, s + t - n, cy);
    else
        MPN_DECR_U (pp + 3 * n, s + t - n, 1);
}
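The recombination above is the standard Karatsuba identity a*b = vinf*B^(2n) + (v0 + vinf ± vm1)*B^n + v0, with the sign of vm1 tracked in vm1_neg. The same identity for the product of two 32-bit words split into 16-bit halves (names local to the sketch):

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  const uint64_t B = 1u << 16;                 /* "limb" base for this sketch */
  uint32_t a = 0x89ab1234u, b = 0x4567cdefu;
  uint64_t a1 = a >> 16, a0 = a & 0xffffu;
  uint64_t b1 = b >> 16, b0 = b & 0xffffu;

  uint64_t v0 = a0 * b0, vinf = a1 * b1;
  int vm1_neg = (a0 < a1) ^ (b0 < b1);         /* sign of (a0-a1)*(b0-b1) */
  uint64_t vm1 = (a0 < a1 ? a1 - a0 : a0 - a1)
               * (b0 < b1 ? b1 - b0 : b0 - b1);

  /* middle coefficient a1*b0 + a0*b1 = v0 + vinf - (a0-a1)*(b0-b1) */
  uint64_t middle = v0 + vinf + (vm1_neg ? vm1 : -vm1);

  assert (vinf * B * B + middle * B + v0 == (uint64_t) a * b);
  return 0;
}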
Example #14
mp_limb_t
mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
		   mp_limb_t d, mp_limb_t dinv)
{
  mp_limb_t B2;
  mp_limb_t u0, u2;
  mp_limb_t q0, q1;
  mp_limb_t p0, p1;
  mp_limb_t t;
  mp_size_t j;

  ASSERT (d & GMP_LIMB_HIGHBIT);
  ASSERT (n > 0);
  ASSERT (u1 < d);

  if (n == 1)
    {
      udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv);
      return u1;
    }

  /* FIXME: Could be precomputed */
  B2 = -d*dinv;

  umul_ppmm (q1, q0, dinv, u1);
  umul_ppmm (p1, p0, B2, u1);
  q1 += u1;
  ASSERT (q1 >= u1);
  u0 = up[n-1];	/* Early read, to allow qp == up. */
  qp[n-1] = q1;

  add_mssaaaa (u2, u1, u0, u0, up[n-2], p1, p0);

  /* FIXME: Keep q1 in a variable between iterations, to reduce number
     of memory accesses. */
  for (j = n-2; j-- > 0; )
    {
      mp_limb_t q2, cy;

      /* Additions for the q update:
       *	+-------+
       *        |u1 * v |
       *        +---+---+
       *        | u1|
       *    +---+---+
       *    | 1 | v |  (conditional on u2)
       *    +---+---+
       *        | 1 |  (conditional on u0 + u2 B2 carry)
       *        +---+
       * +      | q0|
       *   -+---+---+---+
       *    | q2| q1| q0|
       *    +---+---+---+
      */
      umul_ppmm (p1, t, u1, dinv);
      add_ssaaaa (q2, q1, -u2, u2 & dinv, CNST_LIMB(0), u1);
      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), p1);
      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), q0);
      q0 = t;

      umul_ppmm (p1, p0, u1, B2);
      ADDC_LIMB (cy, u0, u0, u2 & B2);
      u0 -= (-cy) & d;

      /* Final q update */
      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), cy);
      qp[j+1] = q1;
      MPN_INCR_U (qp+j+2, n-j-2, q2);

      add_mssaaaa (u2, u1, u0, u0, up[j], p1, p0);
    }

  q1 = (u2 > 0);
  u1 -= (-q1) & d;

  t = (u1 >= d);
  q1 += t;
  u1 -= (-t) & d;

  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
  add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t);

  MPN_INCR_U (qp+1, n-1, q1);

  qp[0] = q0;
  return u0;
}
Example #15
void
mpz_combit (mpz_ptr d, mp_bitcnt_t bit_index)
{
  mp_size_t dsize = SIZ(d);
  mp_ptr dp = PTR(d);

  mp_size_t limb_index = bit_index / GMP_NUMB_BITS;
  mp_limb_t bit = (CNST_LIMB (1) << (bit_index % GMP_NUMB_BITS));

  /* Check for the most common case: Positive input, no realloc or
     normalization needed. */
  if (limb_index + 1 < dsize)
    dp[limb_index] ^= bit;

  /* Check for the hairy case. d < 0, and we have all zero bits to the
     right of the bit to toggle. */
  else if (limb_index < -dsize
	   && (limb_index == 0 || mpn_zero_p (dp, limb_index))
	   && (dp[limb_index] & (bit - 1)) == 0)
    {
      ASSERT (dsize < 0);
      dsize = -dsize;

      if (dp[limb_index] & bit)
	{
	  /* We toggle the least significant one bit. Corresponds to
	     an add, with potential carry propagation, on the absolute
	     value. */
	  dp = MPZ_REALLOC (d, 1 + dsize);
	  dp[dsize] = 0;
	  MPN_INCR_U (dp + limb_index, 1 + dsize - limb_index, bit);
	  SIZ(d) = - dsize - dp[dsize];
	}
      else
	{
	  /* We toggle a zero bit, subtract from the absolute value. */
	  MPN_DECR_U (dp + limb_index, dsize - limb_index, bit);
	  /* The absolute value shrank by at most one bit. */
	  dsize -= dp[dsize - 1] == 0;
	  ASSERT (dsize > 0 && dp[dsize - 1] != 0);
	  SIZ (d) = -dsize;
	}
    }
  else
    {
      /* Simple case: Toggle the bit in the absolute value. */
      dsize = ABS(dsize);
      if (limb_index < dsize)
	{
	  mp_limb_t	 dlimb;
	  dlimb = dp[limb_index] ^ bit;
	  dp[limb_index] = dlimb;

	  /* Can happen only when limb_index = dsize - 1. Avoid SIZ(d)
	     bookkeeping in the common case. */
	  if (UNLIKELY ((dlimb == 0) + limb_index == dsize)) /* dsize == limb_index + 1 */
	    {
	      /* high limb became zero, must normalize */
	      MPN_NORMALIZE (dp, limb_index);
	      SIZ (d) = SIZ (d) >= 0 ? limb_index : -limb_index;
	    }
	}
      else
	{
	  dp = MPZ_REALLOC (d, limb_index + 1);
	  MPN_ZERO(dp + dsize, limb_index - dsize);
	  dp[limb_index++] = bit;
	  SIZ(d) = SIZ(d) >= 0 ? limb_index : -limb_index;
	}
    }
}
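A short usage sketch of this public function: on a negative operand the complemented bit is a bit of the infinite two's-complement representation, which is what the hairy case above handles on the stored absolute value:

#include <assert.h>
#include <gmp.h>

int
main (void)
{
  mpz_t d;

  mpz_init_set_si (d, -4);        /* ...111100 in two's complement        */
  mpz_combit (d, 2);              /* toggle the lowest set bit: -4 -> -8  */
  assert (mpz_cmp_si (d, -8) == 0);

  mpz_set_si (d, -4);
  mpz_combit (d, 0);              /* toggle a zero bit below it: -4 -> -3 */
  assert (mpz_cmp_si (d, -3) == 0);

  mpz_clear (d);
  return 0;
}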
Example #16
static
unsigned long int
lc (mp_ptr rp, gmp_randstate_t rstate)
{
  mp_ptr tp, seedp, ap;
  mp_size_t ta;
  mp_size_t tn, seedn, an;
  unsigned long int m2exp;
  mp_limb_t c;
  TMP_DECL (mark);

  m2exp = rstate->_mp_algdata._mp_lc->_mp_m2exp;

  /* The code below assumes the mod part is a power of two.  Make sure
     that is the case.  */
  ASSERT_ALWAYS (m2exp != 0);

  c = (mp_limb_t) rstate->_mp_algdata._mp_lc->_mp_c;

  seedp = PTR (rstate->_mp_seed);
  seedn = SIZ (rstate->_mp_seed);

  if (seedn == 0)
    {
      /* Seed is 0.  Result is C % M.  Assume table is sensibly stored,
       with C smaller than M.  */
      *rp = c;

      *seedp = c;
      SIZ (rstate->_mp_seed) = 1;
      return m2exp;
    }

  ap = PTR (rstate->_mp_algdata._mp_lc->_mp_a);
  an = SIZ (rstate->_mp_algdata._mp_lc->_mp_a);

  /* Allocate temporary storage.  Let there be room for calculation of
     (A * seed + C) % M, or M if bigger than that.  */

  TMP_MARK (mark);
  ta = an + seedn + 1;
  tp = (mp_ptr) TMP_ALLOC (ta * BYTES_PER_MP_LIMB);

  /* t = a * seed */
  if (seedn >= an)
    mpn_mul (tp, seedp, seedn, ap, an);
  else
    mpn_mul (tp, ap, an, seedp, seedn);
  tn = an + seedn;

  /* t = t + c */
  tp[tn] = 0;			/* sentinel, stops MPN_INCR_U */
  MPN_INCR_U (tp, tn, c);

  ASSERT_ALWAYS (m2exp / GMP_NUMB_BITS < ta);

  /* t = t % m */
  tp[m2exp / GMP_NUMB_BITS] &= ((mp_limb_t) 1 << m2exp % GMP_NUMB_BITS) - 1;
  tn = (m2exp + GMP_NUMB_BITS - 1) / GMP_NUMB_BITS;

  /* Save result as next seed.  */
  MPN_COPY (PTR (rstate->_mp_seed), tp, tn);
  SIZ (rstate->_mp_seed) = tn;

  {
    /* Discard the lower m2exp/2 bits of result.  */
    unsigned long int bits = m2exp / 2;
    mp_size_t xn = bits / GMP_NUMB_BITS;

    tn -= xn;
    if (tn > 0)
      {
	unsigned int cnt = bits % GMP_NUMB_BITS;
	if (cnt != 0)
	  { 
	    mpn_rshift (tp, tp + xn, tn, cnt);
	    MPN_COPY_INCR (rp, tp, xn + 1); 
	  }
	else			/* Even limb boundary.  */
	  MPN_COPY_INCR (rp, tp + xn, tn);
      }
  }

  TMP_FREE (mark);

  /* Return number of valid bits in the result.  */
  return (m2exp + 1) / 2;
}
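The step above is a truncated linear congruential recurrence: seed <- (a*seed + c) mod 2^m2exp, with only the upper half of the bits returned. A one-word sketch with illustrative constants (not the ones from GMP's parameter tables):

#include <assert.h>
#include <stdint.h>

static uint32_t
lc_step (uint64_t *seed, uint64_t a, uint64_t c, unsigned m2exp)
{
  uint64_t mask = m2exp == 64 ? ~UINT64_C (0) : (UINT64_C (1) << m2exp) - 1;
  *seed = (a * *seed + c) & mask;              /* (a*seed + c) mod 2^m2exp  */
  return (uint32_t) (*seed >> (m2exp / 2));    /* discard the low half bits */
}

int
main (void)
{
  uint64_t seed = 12345;
  uint32_t r = lc_step (&seed, 6364136223846793005ULL, 1442695040888963407ULL, 64);
  assert (r == (uint32_t) (seed >> 32));
  return 0;
}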
void
mpn_toom_interpolate_6pts (mp_ptr pp, mp_size_t n, enum toom6_flags flags,
			   mp_ptr w4, mp_ptr w2, mp_ptr w1,
			   mp_size_t w0n)
{
  mp_limb_t cy;
  /* cy6 can be stored in w1[2*n], cy4 in w4[0], embankment in w2[0] */
  mp_limb_t cy4, cy6, embankment;

  ASSERT( n > 0 );
  ASSERT( 2*n >= w0n && w0n > 0 );

#define w5  pp					/* 2n   */
#define w3  (pp + 2 * n)			/* 2n+1 */
#define w0  (pp + 5 * n)			/* w0n  */

  /* Interpolate with sequence:
     W2 =(W1 - W2)>>2
     W1 =(W1 - W5)>>1
     W1 =(W1 - W2)>>1
     W4 =(W3 - W4)>>1
     W2 =(W2 - W4)/3
     W3 = W3 - W4 - W5
     W1 =(W1 - W3)/3
     // Last steps are mixed with recomposition...
     W2 = W2 - W0<<2
     W4 = W4 - W2
     W3 = W3 - W1
     W2 = W2 - W0
  */

  /* W2 =(W1 - W2)>>2 */
  if (flags & toom6_vm2_neg)
    mpn_add_n (w2, w1, w2, 2 * n + 1);
  else
    mpn_sub_n (w2, w1, w2, 2 * n + 1);
  mpn_rshift (w2, w2, 2 * n + 1, 2);

  /* W1 =(W1 - W5)>>1 */
  w1[2*n] -= mpn_sub_n (w1, w1, w5, 2*n);
  mpn_rshift (w1, w1, 2 * n + 1, 1);

  /* W1 =(W1 - W2)>>1 */
#if HAVE_NATIVE_mpn_rsh1sub_n
  mpn_rsh1sub_n (w1, w1, w2, 2 * n + 1);
#else
  mpn_sub_n (w1, w1, w2, 2 * n + 1);
  mpn_rshift (w1, w1, 2 * n + 1, 1);
#endif

  /* W4 =(W3 - W4)>>1 */
  if (flags & toom6_vm1_neg)
    {
#if HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (w4, w3, w4, 2 * n + 1);
#else
      mpn_add_n (w4, w3, w4, 2 * n + 1);
      mpn_rshift (w4, w4, 2 * n + 1, 1);
#endif
    }
  else
    {
#if HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (w4, w3, w4, 2 * n + 1);
#else
      mpn_sub_n (w4, w3, w4, 2 * n + 1);
      mpn_rshift (w4, w4, 2 * n + 1, 1);
#endif
    }

  /* W2 =(W2 - W4)/3 */
  mpn_sub_n (w2, w2, w4, 2 * n + 1);
  mpn_divexact_by3 (w2, w2, 2 * n + 1);

  /* W3 = W3 - W4 - W5 */
  mpn_sub_n (w3, w3, w4, 2 * n + 1);
  w3[2 * n] -= mpn_sub_n (w3, w3, w5, 2 * n);

  /* W1 =(W1 - W3)/3 */
  mpn_sub_n (w1, w1, w3, 2 * n + 1);
  mpn_divexact_by3 (w1, w1, 2 * n + 1);

  /*
    [1 0 0 0 0 0;
     0 1 0 0 0 0;
     1 0 1 0 0 0;
     0 1 0 1 0 0;
     1 0 1 0 1 0;
     0 0 0 0 0 1]

    pp[] prior to operations:
     |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|

    summation scheme for remaining operations:
     |______________5|n_____4|n_____3|n_____2|n______|n______|pp
     |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|
				    || H w4  | L w4  |
		    || H w2  | L w2  |
	    || H w1  | L w1  |
			    ||-H w1  |-L w1  |
		     |-H w0  |-L w0 ||-H w2  |-L w2  |
  */
  cy = mpn_add_n (pp + n, pp + n, w4, 2 * n + 1);
  MPN_INCR_U (pp + 3 * n + 1, n, cy);

  /* W2 -= W0<<2 */
#if HAVE_NATIVE_mpn_sublsh_n || HAVE_NATIVE_mpn_sublsh2_n_ip1
#if HAVE_NATIVE_mpn_sublsh2_n_ip1
  cy = mpn_sublsh2_n_ip1 (w2, w0, w0n);
#else
  cy = mpn_sublsh_n (w2, w2, w0, w0n, 2);
#endif
#else
  /* {W4,2*n+1} is now free and can be overwritten. */
  cy = mpn_lshift(w4, w0, w0n, 2);
  cy+= mpn_sub_n(w2, w2, w4, w0n);
#endif
  MPN_DECR_U (w2 + w0n, 2 * n + 1 - w0n, cy);

  /* W4L = W4L - W2L */
  cy = mpn_sub_n (pp + n, pp + n, w2, n);
  MPN_DECR_U (w3, 2 * n + 1, cy);

  /* W3H = W3H + W2L */
  cy4 = w3[2 * n] + mpn_add_n (pp + 3 * n, pp + 3 * n, w2, n);
  /* W1L + W2H */
  cy = w2[2 * n] + mpn_add_n (pp + 4 * n, w1, w2 + n, n);
  MPN_INCR_U (w1 + n, n + 1, cy);

  /* W0 = W0 + W1H */
  if (LIKELY (w0n > n))
    cy6 = w1[2 * n] + mpn_add_n (w0, w0, w1 + n, n);
  else
    cy6 = mpn_add_n (w0, w0, w1 + n, w0n);

  /*
    summation scheme for the next operation:
     |...____5|n_____4|n_____3|n_____2|n______|n______|pp
     |...w0___|_w1_w2_|_H w3__|_L w3__|_H w5__|_L w5__|
		     ...-w0___|-w1_w2 |
  */
  /* if(LIKELY(w0n>n)) the two operands below DO overlap! */
  cy = mpn_sub_n (pp + 2 * n, pp + 2 * n, pp + 4 * n, n + w0n);

  /* embankment is a "dirty trick" to avoid carry/borrow propagation
     beyond allocated memory */
  embankment = w0[w0n - 1] - 1;
  w0[w0n - 1] = 1;
  if (LIKELY (w0n > n)) {
    if (cy4 > cy6)
      MPN_INCR_U (pp + 4 * n, w0n + n, cy4 - cy6);
    else
      MPN_DECR_U (pp + 4 * n, w0n + n, cy6 - cy4);
    MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy);
    MPN_INCR_U (w0 + n, w0n - n, cy6);
  } else {
    MPN_INCR_U (pp + 4 * n, w0n + n, cy4);
    MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy + cy6);
  }
  w0[w0n - 1] += embankment;

#undef w5
#undef w3
#undef w0

}
Example #18
/*
  Computes {np, n} / {dp, n} mod B^n, using a divide-and-conquer
  algorithm, switching to the classical method for sizes <= DC_BDIV_Q_THRESHOLD.

  Also computes a 2 limb "overflow". See sb_bdiv_q.c for a definition.

  scratch is workspace.
*/
void
mpn_dc_bdiv_q_n (mp_ptr qp, mp_ptr wp, mp_ptr np, mp_srcptr dp,
		   mp_size_t n, mp_limb_t dinv, mp_ptr scratch)
{
  mp_size_t s, t;
  mp_limb_t cy;

  ASSERT (n >= 6);
  ASSERT (! MPN_OVERLAP_P (qp, n, np, n));
  ASSERT (! MPN_OVERLAP_P (qp, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (wp, 2, np, n));
  ASSERT (! MPN_OVERLAP_P (wp, 2, dp, n));
  ASSERT (! MPN_OVERLAP_P (np, n, dp, n));

  /*
    Example with s = 4, t = 3, n = 7:

         C
         C C
         C C C
  qp  .  A B B B
      .  A A B B B
      1  A A A B B B
      0  A A A A B B B
         0 1 ...
           dp
  */

  t = n / 2;    /*  t = floor(n/2)  */
  s = n - t;    /*  s = ceil(n/2)   */

  /*  recurse into low half of quotient (region A)  */
  if (s <= DC_BDIV_Q_THRESHOLD)
    mpn_sb_bdiv_q (qp, wp, np, s, dp, s, dinv);
  else
    mpn_dc_bdiv_q_n (qp, wp, np, dp, s, dinv, scratch);

  /*  remove region B and overflow from A from N
      (if n odd, do first row of B separately --- we could have used
      mpn_mulmid, but this saves some logic) */
  mpn_mulmid_n (scratch, dp + 1, qp + (n & 1), t);
  if (n & 1)
    {
      cy = mpn_addmul_1 (scratch, dp + s, t, qp[0]);
      MPN_INCR_U (scratch + t, 2, cy);
    }
  ADDC_LIMB (cy, scratch[0], scratch[0], wp[0]);      /* overflow from A */
  MPN_INCR_U (scratch + 1, t + 1, wp[1] + cy);
  cy = mpn_sub_n (np + s, np + s, scratch, t);
  MPN_INCR_U (scratch + t, 2, cy);

  /*  recurse into top half of quotient (region C)
      (this does not overwrite {scratch + t, 2}, because n >= 6 implies
      t >= 3 implies floor(t/2) + 2 <= t) */
  if (t <= DC_BDIV_Q_THRESHOLD)
    mpn_sb_bdiv_q (qp + s, wp, np + s, t, dp, t, dinv);
  else
    mpn_dc_bdiv_q_n (qp + s, wp, np + s, dp, t, dinv, scratch);

  /*  combine overflows from B and C  */
  ADDC_LIMB (cy, wp[0], wp[0], scratch[t]);
  wp[1] += scratch[t + 1] + cy;
}
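The bdiv quotient is division modulo B^n: when the low limb of the divisor is odd, it amounts to multiplication by the divisor's inverse mod B^n. A one-limb sketch that computes the inverse of an odd 64-bit d by Newton steps and recovers an exact quotient mod 2^64 (names local to the sketch):

#include <assert.h>
#include <stdint.h>

static uint64_t
binvert64 (uint64_t d)            /* d odd; returns d^{-1} mod 2^64 */
{
  uint64_t x = d;                 /* d is its own inverse mod 8 (3 good bits) */
  x *= 2 - d * x;                 /* each Newton step doubles the good bits   */
  x *= 2 - d * x;
  x *= 2 - d * x;
  x *= 2 - d * x;
  x *= 2 - d * x;                 /* 3 -> 6 -> 12 -> 24 -> 48 -> 96 >= 64     */
  return x;
}

int
main (void)
{
  uint64_t d = 0x123456789abcdef1ULL;          /* odd divisor          */
  uint64_t q = 0x0fedcba987654321ULL;
  uint64_t n = q * d;                          /* a multiple, mod 2^64 */

  uint64_t dinv = binvert64 (d);
  assert (d * dinv == 1);                      /* inverse mod 2^64     */
  assert (n * dinv == q);                      /* quotient mod 2^64    */
  return 0;
}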
Example #19
void
mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
{
  mp_limb_t w, w0, w1;
  mp_size_t n2;
  mp_srcptr x, y;
  mp_size_t i;
  int sign;

  n2 = n >> 1;
  ASSERT (n2 > 0);

  if ((n & 1) != 0)
    {
      /* Odd length. */
      mp_size_t n1, n3, nm1;

      n3 = n - n2;

      sign = 0;
      w = a[n2];
      if (w != 0)
	w -= mpn_sub_n (p, a, a + n3, n2);
      else
	{
	  i = n2;
	  do
	    {
	      --i;
	      w0 = a[i];
	      w1 = a[n3 + i];
	    }
	  while (w0 == w1 && i != 0);
	  if (w0 < w1)
	    {
	      x = a + n3;
	      y = a;
	      sign = ~0;
	    }
	  else
	    {
	      x = a;
	      y = a + n3;
	    }
	  mpn_sub_n (p, x, y, n2);
	}
      p[n2] = w;

      w = b[n2];
      if (w != 0)
	w -= mpn_sub_n (p + n3, b, b + n3, n2);
      else
	{
	  i = n2;
	  do
	    {
	      --i;
	      w0 = b[i];
	      w1 = b[n3 + i];
	    }
	  while (w0 == w1 && i != 0);
	  if (w0 < w1)
	    {
	      x = b + n3;
	      y = b;
	      sign = ~sign;
	    }
	  else
	    {
	      x = b;
	      y = b + n3;
	    }
	  mpn_sub_n (p + n3, x, y, n2);
	}
      p[n] = w;

      n1 = n + 1;
      if (n2 < MUL_KARATSUBA_THRESHOLD)
	{
	  if (n3 < MUL_KARATSUBA_THRESHOLD)
	    {
	      mpn_mul_basecase (ws, p, n3, p + n3, n3);
	      mpn_mul_basecase (p, a, n3, b, n3);
	    }
	  else
	    {
	      mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
	      mpn_kara_mul_n (p, a, b, n3, ws + n1);
	    }
	  mpn_mul_basecase (p + n1, a + n3, n2, b + n3, n2);
	}
      else
	{
	  mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
	  mpn_kara_mul_n (p, a, b, n3, ws + n1);
	  mpn_kara_mul_n (p + n1, a + n3, b + n3, n2, ws + n1);
	}

      if (sign)
	mpn_add_n (ws, p, ws, n1);
      else
	mpn_sub_n (ws, p, ws, n1);

      nm1 = n - 1;
      if (mpn_add_n (ws, p + n1, ws, nm1))
	{
	  mp_limb_t x = (ws[nm1] + 1) & GMP_NUMB_MASK;
	  ws[nm1] = x;
	  if (x == 0)
	    ws[n] = (ws[n] + 1) & GMP_NUMB_MASK;
	}
      if (mpn_add_n (p + n3, p + n3, ws, n1))
	{
	  mpn_incr_u (p + n1 + n3, 1);
	}
    }
  else
    {
      /* Even length. */
      i = n2;
      do
	{
	  --i;
	  w0 = a[i];
	  w1 = a[n2 + i];
	}
      while (w0 == w1 && i != 0);
      sign = 0;
      if (w0 < w1)
	{
	  x = a + n2;
	  y = a;
	  sign = ~0;
	}
      else
	{
	  x = a;
	  y = a + n2;
	}
      mpn_sub_n (p, x, y, n2);

      i = n2;
      do
	{
	  --i;
	  w0 = b[i];
	  w1 = b[n2 + i];
	}
      while (w0 == w1 && i != 0);
      if (w0 < w1)
	{
	  x = b + n2;
	  y = b;
	  sign = ~sign;
	}
      else
	{
	  x = b;
	  y = b + n2;
	}
      mpn_sub_n (p + n2, x, y, n2);

      /* Pointwise products. */
      if (n2 < MUL_KARATSUBA_THRESHOLD)
	{
	  mpn_mul_basecase (ws, p, n2, p + n2, n2);
	  mpn_mul_basecase (p, a, n2, b, n2);
	  mpn_mul_basecase (p + n, a + n2, n2, b + n2, n2);
	}
      else
	{
	  mpn_kara_mul_n (ws, p, p + n2, n2, ws + n);
	  mpn_kara_mul_n (p, a, b, n2, ws + n);
	  mpn_kara_mul_n (p + n, a + n2, b + n2, n2, ws + n);
	}

      /* Interpolate. */
      if (sign)
	w = mpn_add_n (ws, p, ws, n);
      else
	w = -mpn_sub_n (ws, p, ws, n);
      w += mpn_add_n (ws, p + n, ws, n);
      w += mpn_add_n (p + n2, p + n2, ws, n);
      MPN_INCR_U (p + n2 + n, 2 * n - (n2 + n), w);
    }
}
Example #20
/* FIXME:
    x Take scratch parameter, and figure out scratch need.

    x Use some fallback for small M->n?
*/
static mp_size_t
hgcd_matrix_apply (const struct hgcd_matrix *M,
		   mp_ptr ap, mp_ptr bp,
		   mp_size_t n)
{
  mp_size_t an, bn, un, vn, nn;
  mp_size_t mn[2][2];
  mp_size_t modn;
  mp_ptr tp, sp, scratch;
  mp_limb_t cy;
  unsigned i, j;

  TMP_DECL;

  ASSERT ( (ap[n-1] | bp[n-1]) > 0);

  an = n;
  MPN_NORMALIZE (ap, an);
  bn = n;
  MPN_NORMALIZE (bp, bn);

  for (i = 0; i < 2; i++)
    for (j = 0; j < 2; j++)
      {
	mp_size_t k;
	k = M->n;
	MPN_NORMALIZE (M->p[i][j], k);
	mn[i][j] = k;
      }

  ASSERT (mn[0][0] > 0);
  ASSERT (mn[1][1] > 0);
  ASSERT ( (mn[0][1] | mn[1][0]) > 0);

  TMP_MARK;

  if (mn[0][1] == 0)
    {
      /* A unchanged, M = (1, 0; q, 1) */
      ASSERT (mn[0][0] == 1);
      ASSERT (M->p[0][0][0] == 1);
      ASSERT (mn[1][1] == 1);
      ASSERT (M->p[1][1][0] == 1);

      /* Put B <-- B - q A */
      nn = submul (bp, bn, ap, an, M->p[1][0], mn[1][0]);
    }
  else if (mn[1][0] == 0)
    {
      /* B unchanged, M = (1, q; 0, 1) */
      ASSERT (mn[0][0] == 1);
      ASSERT (M->p[0][0][0] == 1);
      ASSERT (mn[1][1] == 1);
      ASSERT (M->p[1][1][0] == 1);

      /* Put A  <-- A - q * B */
      nn = submul (ap, an, bp, bn, M->p[0][1], mn[0][1]);
    }
  else
    {
      /* A = m00 a + m01 b  ==> a <= A / m00, b <= A / m01.
	 B = m10 a + m11 b  ==> a <= B / m10, b <= B / m11. */
      un = MIN (an - mn[0][0], bn - mn[1][0]) + 1;
      vn = MIN (an - mn[0][1], bn - mn[1][1]) + 1;

      nn = MAX (un, vn);
      /* In the range of interest, mulmod_bnm1 should always beat mullo. */
      modn = mpn_mulmod_bnm1_next_size (nn + 1);

      scratch = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (modn, modn, M->n));
      tp = TMP_ALLOC_LIMBS (modn);
      sp = TMP_ALLOC_LIMBS (modn);

      ASSERT (n <= 2*modn);

      if (n > modn)
	{
	  cy = mpn_add (ap, ap, modn, ap + modn, n - modn);
	  MPN_INCR_U (ap, modn, cy);

	  cy = mpn_add (bp, bp, modn, bp + modn, n - modn);
	  MPN_INCR_U (bp, modn, cy);

	  n = modn;
	}

      mpn_mulmod_bnm1 (tp, modn, ap, n, M->p[1][1], mn[1][1], scratch);
      mpn_mulmod_bnm1 (sp, modn, bp, n, M->p[0][1], mn[0][1], scratch);

      /* FIXME: Handle the small n case in some better way. */
      if (n + mn[1][1] < modn)
	MPN_ZERO (tp + n + mn[1][1], modn - n - mn[1][1]);
      if (n + mn[0][1] < modn)
	MPN_ZERO (sp + n + mn[0][1], modn - n - mn[0][1]);

      cy = mpn_sub_n (tp, tp, sp, modn);
      MPN_DECR_U (tp, modn, cy);

      ASSERT (mpn_zero_p (tp + nn, modn - nn));

      mpn_mulmod_bnm1 (sp, modn, ap, n, M->p[1][0], mn[1][0], scratch);
      MPN_COPY (ap, tp, nn);
      mpn_mulmod_bnm1 (tp, modn, bp, n, M->p[0][0], mn[0][0], scratch);

      if (n + mn[1][0] < modn)
	MPN_ZERO (sp + n + mn[1][0], modn - n - mn[1][0]);
      if (n + mn[0][0] < modn)
	MPN_ZERO (tp + n + mn[0][0], modn - n - mn[0][0]);

      cy = mpn_sub_n (tp, tp, sp, modn);
      MPN_DECR_U (tp, modn, cy);

      ASSERT (mpn_zero_p (tp + nn, modn - nn));
      MPN_COPY (bp, tp, nn);

      while ( (ap[nn-1] | bp[nn-1]) == 0)
	{
	  nn--;
	  ASSERT (nn > 0);
	}
    }
  TMP_FREE;

  return nn;
}
Example #21
void
mpn_toom4_sqr (mp_ptr pp,
               mp_srcptr ap, mp_size_t an,
               mp_ptr scratch)
{
    mp_size_t n, s;
    mp_limb_t cy;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)

    n = (an + 3) >> 2;

    s = an - 3 * n;

    ASSERT (0 < s && s <= n);

    /* NOTE: The multiplications to v2, vm2, vh and vm1 overwrite the
     * following limb, so these must be computed in order, and we need a
     * one limb gap to tp. */
#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 6 * n)			/* s+t */
#define v2    scratch				/* 2n+1 */
#define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
#define vh    (scratch + 4 * n + 2)		/* 2n+1 */
#define vm1   (scratch + 6 * n + 3)		/* 2n+1 */
#define tp (scratch + 8*n + 5)

    /* No overlap with v1 */
#define apx   pp				/* n+1 */
#define amx   (pp + 4*n + 2)			/* n+1 */

    /* Total scratch need: 8*n + 5 + scratch for recursive calls. This
       gives roughly 32 n/3 + log term. */

    /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 and amx = a0 - 2 a1 + 4 a2 - 8 a3.  */
    mpn_toom_eval_dgr3_pm2 (apx, amx, ap, n, s, tp);

    TOOM4_SQR_REC (v2, apx, n + 1, tp);	/* v2,  2n+1 limbs */
    TOOM4_SQR_REC (vm2, amx, n + 1, tp);	/* vm2,  2n+1 limbs */

    /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 = ((2*a0 + a1) * 2 + a2) * 2 + a3 */
#if HAVE_NATIVE_mpn_addlsh1_n
    cy = mpn_addlsh1_n (apx, a1, a0, n);
    cy = 2*cy + mpn_addlsh1_n (apx, a2, apx, n);
    if (s < n)
    {
        mp_limb_t cy2;
        cy2 = mpn_addlsh1_n (apx, a3, apx, s);
        apx[n] = 2*cy + mpn_lshift (apx + s, apx + s, n - s, 1);
        MPN_INCR_U (apx + s, n+1-s, cy2);
    }
    else
        apx[n] = 2*cy + mpn_addlsh1_n (apx, a3, apx, n);
#else
    cy = mpn_lshift (apx, a0, n, 1);
    cy += mpn_add_n (apx, apx, a1, n);
    cy = 2*cy + mpn_lshift (apx, apx, n, 1);
    cy += mpn_add_n (apx, apx, a2, n);
    cy = 2*cy + mpn_lshift (apx, apx, n, 1);
    apx[n] = cy + mpn_add (apx, apx, n, a3, s);
#endif

    ASSERT (apx[n] < 15);

    TOOM4_SQR_REC (vh, apx, n + 1, tp);	/* vh,  2n+1 limbs */

    /* Compute apx = a0 + a1 + a2 + a3 and amx = a0 - a1 + a2 - a3.  */
    mpn_toom_eval_dgr3_pm1 (apx, amx, ap, n, s, tp);

    TOOM4_SQR_REC (v1, apx, n + 1, tp);	/* v1,  2n+1 limbs */
    TOOM4_SQR_REC (vm1, amx, n + 1, tp);	/* vm1,  2n+1 limbs */

    TOOM4_SQR_REC (v0, a0, n, tp);
    TOOM4_SQR_REC (vinf, a3, s, tp);	/* vinf, 2s limbs */

    mpn_toom_interpolate_7pts (pp, n, 0, vm2, vm1, v2, vh, 2*s, tp);
}
void
mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5,
			mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
{
  mp_limb_t cy;
  mp_size_t n3;
  mp_size_t n3p1;
  n3 = 3 * n;
  n3p1 = n3 + 1;

#define   r4    (pp + n3)			/* 3n+1 */
#define   r2    (pp + 7 * n)			/* 3n+1 */
#define   r0    (pp +11 * n)			/* s+t <= 2*n */

  /******************************* interpolation *****************************/
  if (half != 0) {
    cy = mpn_sub_n (r3, r3, r0, spt);
    MPN_DECR_U (r3 + spt, n3p1 - spt, cy);

    cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi);
    MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi);

    cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi);
    MPN_DECR_U (r1 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi);
  };

  r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi);
  DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi);

#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r1, r4, r4, r1, n3p1);
#else
  ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1));
  mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */
  MP_PTR_SWAP(r1, wsi);
#endif

  r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi);
  DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi);

#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
#else
  mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
  ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
  MP_PTR_SWAP(r5, wsi);
#endif

  r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n);

#if AORSMUL_FASTER_AORS_AORSLSH
  mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */
#else
  mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */
  DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */
#endif
  /* A division by 2835x4 follows. Warning: the operand can be negative! */
  mpn_divexact_by2835x4(r4, r4, n3p1);
  if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
    r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));

#if AORSMUL_FASTER_2AORSLSH
  mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */
#else
  DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */
  DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */
#endif
  mpn_divexact_by255(r5, r5, n3p1);

  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi));

#if AORSMUL_FASTER_3AORSLSH
  ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100));
#else
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi));
#endif
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi));
  mpn_divexact_by42525(r1, r1, n3p1);

#if AORSMUL_FASTER_AORS_2AORSLSH
  ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225));
#else
  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1));
  ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi));
#endif
  mpn_divexact_by9x4(r2, r2, n3p1);

  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1));

  mpn_sub_n (r4, r2, r4, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1));
  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1));

  mpn_add_n (r5, r5, r1, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));

  /* last interpolation steps... */
  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
  ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1));
  /* ... could be mixed with recomposition
	||H-r5|M-r5|L-r5|   ||H-r1|M-r1|L-r1|
  */

  /***************************** recomposition *******************************/
  /*
    pp[] prior to operations:
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp

    summation scheme for remaining operations:
    |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
	||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|
  */

  cy = mpn_add_n (pp + n, pp + n, r5, n);
  cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy);
#if HAVE_NATIVE_mpn_add_nc
  cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy);
#else
  MPN_INCR_U (r5 + 2 * n, n + 1, cy);
  cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n);
#endif
  MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy);

  pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n);
  cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]);
#if HAVE_NATIVE_mpn_add_nc
  cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy);
#else
  MPN_INCR_U (r3 + 2 * n, n + 1, cy);
  cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n);
#endif
  MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);

  pp[10*n]+=mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n);
  if (half) {
    cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]);
#if HAVE_NATIVE_mpn_add_nc
    if (LIKELY (spt > n)) {
      cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy);
      MPN_INCR_U (pp + 4 * n3, spt - n, cy);
    } else {
      ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy));
    }
#else
    MPN_INCR_U (r1 + 2 * n, n + 1, cy);
    if (LIKELY (spt > n)) {
      cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n);
      MPN_INCR_U (pp + 4 * n3, spt - n, cy);
    } else {
      ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt));
    }
#endif
  } else {
    ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n]));
  }

#undef   r0
#undef   r2
#undef   r4
}
Exemple #23
0
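/* The function below squares the pieces through a TOOM2_SQR_REC macro that
   is defined elsewhere in the original file.  A plausible definition (a
   sketch only, with the threshold name assumed) uses the basecase for small
   operands and recurses otherwise: */
#define TOOM2_SQR_REC(p, a, n, ws)					\
  do {									\
    if (BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))			\
      mpn_sqr_basecase (p, a, n);					\
    else								\
      mpn_toom2_sqr (p, a, n, ws);					\
  } while (0)
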
void
mpn_toom2_sqr (mp_ptr pp,
	       mp_srcptr ap, mp_size_t an,
	       mp_ptr scratch)
{
  const int __gmpn_cpuvec_initialized = 1;
  mp_size_t n, s;
  mp_limb_t cy, cy2;
  mp_ptr asm1;

#define a0  ap
#define a1  (ap + n)

  s = an >> 1;
  n = an - s;

  ASSERT (0 < s && s <= n && s >= n - 1);

  asm1 = pp;

  /* Compute asm1.  */
  if (s == n)
    {
      if (mpn_cmp (a0, a1, n) < 0)
	{
	  mpn_sub_n (asm1, a1, a0, n);
	}
      else
	{
	  mpn_sub_n (asm1, a0, a1, n);
	}
    }
  else /* n - s == 1 */
    {
      if (a0[s] == 0 && mpn_cmp (a0, a1, s) < 0)
	{
	  mpn_sub_n (asm1, a1, a0, s);
	  asm1[s] = 0;
	}
      else
	{
	  asm1[s] = a0[s] - mpn_sub_n (asm1, a0, a1, s);
	}
    }

#define v0	pp				/* 2n */
#define vinf	(pp + 2 * n)			/* s+s */
#define vm1	scratch				/* 2n */
#define scratch_out	scratch + 2 * n

  /* vm1, 2n limbs */
  TOOM2_SQR_REC (vm1, asm1, n, scratch_out);

  /* vinf, s+s limbs */
  TOOM2_SQR_REC (vinf, a1, s, scratch_out);

  /* v0, 2n limbs */
  TOOM2_SQR_REC (v0, ap, n, scratch_out);
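
  /* Recombination: with a = a1*B^n + a0 and vm1 = (a0 - a1)^2, we have
     2*a0*a1 = v0 + vinf - vm1, so a^2 = vinf*B^(2n) + (v0 + vinf - vm1)*B^n + v0.
     The additions and the subtraction below assemble the middle term in
     place at pp + n. */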

  /* H(v0) + L(vinf) */
  cy = mpn_add_n (pp + 2 * n, v0 + n, vinf, n);

  /* L(v0) + H(v0) */
  cy2 = cy + mpn_add_n (pp + n, pp + 2 * n, v0, n);

  /* L(vinf) + H(vinf) */
  cy += mpn_add (pp + 2 * n, pp + 2 * n, n, vinf + n, s + s - n);

  cy -= mpn_sub_n (pp + n, pp + n, vm1, 2 * n);

  ASSERT (cy + 1 <= 3);
  ASSERT (cy2 <= 2);

  MPN_INCR_U (pp + 2 * n, s + s, cy2);
  if (LIKELY (cy <= 2))
    MPN_INCR_U (pp + 3 * n, s + s - n, cy);
  else
    MPN_DECR_U (pp + 3 * n, s + s - n, 1);
}
Exemple #24
0
void
mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
{
  mp_limb_t w, w0, w1;
  mp_size_t n2;
  mp_srcptr x, y;
  mp_size_t i;

  n2 = n >> 1;
  ASSERT (n2 > 0);

  if ((n & 1) != 0)
    {
      /* Odd length. */
      mp_size_t n1, n3, nm1;

      n3 = n - n2;

      w = a[n2];
      if (w != 0)
	w -= mpn_sub_n (p, a, a + n3, n2);
      else
	{
	  i = n2;
	  do
	    {
	      --i;
	      w0 = a[i];
	      w1 = a[n3 + i];
	    }
	  while (w0 == w1 && i != 0);
	  if (w0 < w1)
	    {
	      x = a + n3;
	      y = a;
	    }
	  else
	    {
	      x = a;
	      y = a + n3;
	    }
	  mpn_sub_n (p, x, y, n2);
	}
      p[n2] = w;

      n1 = n + 1;

      /* n2 is always either n3 or n3-1, so maybe the two sets of tests here
	 could be combined.  But that's not important, since the tests will
	 take a minuscule amount of time compared to the function calls.  */
      if (BELOW_THRESHOLD (n3, SQR_BASECASE_THRESHOLD))
	{
	  mpn_mul_basecase (ws, p, n3, p, n3);
	  mpn_mul_basecase (p,  a, n3, a, n3);
	}
      else if (BELOW_THRESHOLD (n3, SQR_KARATSUBA_THRESHOLD))
	{
	  mpn_sqr_basecase (ws, p, n3);
	  mpn_sqr_basecase (p,  a, n3);
	}
      else
	{
	  mpn_kara_sqr_n   (ws, p, n3, ws + n1);	 /* (x-y)^2 */
	  mpn_kara_sqr_n   (p,  a, n3, ws + n1);	 /* x^2	    */
	}
      if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD))
	mpn_mul_basecase (p + n1, a + n3, n2, a + n3, n2);
      else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD))
	mpn_sqr_basecase (p + n1, a + n3, n2);
      else
	mpn_kara_sqr_n   (p + n1, a + n3, n2, ws + n1);	 /* y^2	    */

      /* Since x^2+y^2-(x-y)^2 = 2xy >= 0 there's no need to track the
	 borrow from mpn_sub_n.	 If it occurs then it'll be cancelled by a
	 carry from ws[n].  Further, since 2xy fits in n1 limbs there won't
	 be any carry out of ws[n] other than cancelling that borrow. */

      mpn_sub_n (ws, p, ws, n1);	     /* x^2-(x-y)^2 */

      nm1 = n - 1;
      if (mpn_add_n (ws, p + n1, ws, nm1))   /* x^2+y^2-(x-y)^2 = 2xy */
	{
	  mp_limb_t x = (ws[nm1] + 1) & GMP_NUMB_MASK;
	  ws[nm1] = x;
	  if (x == 0)
	    ws[n] = (ws[n] + 1) & GMP_NUMB_MASK;
	}
      if (mpn_add_n (p + n3, p + n3, ws, n1))
	{
	  mpn_incr_u (p + n1 + n3, 1);
	}
    }
  else
    {
      /* Even length. */
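      /* Scan from the top to find the most significant limb where the two
	 halves of {a, n} differ, so that the smaller half is subtracted
	 from the larger and the difference {p, n2} is nonnegative. */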
      i = n2;
      do
	{
	  --i;
	  w0 = a[i];
	  w1 = a[n2 + i];
	}
      while (w0 == w1 && i != 0);
      if (w0 < w1)
	{
	  x = a + n2;
	  y = a;
	}
      else
	{
	  x = a;
	  y = a + n2;
	}
      mpn_sub_n (p, x, y, n2);

      /* Pointwise products. */
      if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD))
	{
	  mpn_mul_basecase (ws,    p,      n2, p,      n2);
	  mpn_mul_basecase (p,     a,      n2, a,      n2);
	  mpn_mul_basecase (p + n, a + n2, n2, a + n2, n2);
	}
      else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD))
	{
	  mpn_sqr_basecase (ws,    p,      n2);
	  mpn_sqr_basecase (p,     a,      n2);
	  mpn_sqr_basecase (p + n, a + n2, n2);
	}
      else
	{
	  mpn_kara_sqr_n (ws,    p,      n2, ws + n);
	  mpn_kara_sqr_n (p,     a,      n2, ws + n);
	  mpn_kara_sqr_n (p + n, a + n2, n2, ws + n);
	}

      /* Interpolate. */
      w = -mpn_sub_n (ws, p, ws, n);
      w += mpn_add_n (ws, p + n, ws, n);
      w += mpn_add_n (p + n2, p + n2, ws, n);
      MPN_INCR_U (p + n2 + n, 2 * n - (n2 + n), w);
    }
}
void
mpn_toom_interpolate_5pts (mp_ptr c, mp_ptr v2, mp_ptr vm1,
			   mp_size_t k, mp_size_t twor, int sa,
			   mp_limb_t vinf0)
{
  mp_limb_t cy, saved;
  mp_size_t twok;
  mp_size_t kk1;
  mp_ptr c1, v1, c3, vinf;

  twok = k + k;
  kk1 = twok + 1;

  c1 = c  + k;
  v1 = c1 + k;
  c3 = v1 + k;
  vinf = c3 + k;

#define v0 (c)
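  /* In the step comments below, a row vector such as (16 8 4 2 1) lists the
     multiples of the product coefficients (x4 x3 x2 x1 x0) that make up a
     value; e.g. v2 = x(2) = 16*x4 + 8*x3 + 4*x2 + 2*x1 + x0, and vm1 is the
     (possibly negated) value x(-1). */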
  /* (1) v2 <- v2-vm1 < v2+|vm1|,       (16 8 4 2 1) - (1 -1 1 -1  1) =
     thus 0 <= v2 < 50*B^(2k) < 2^6*B^(2k)             (15 9 3  3  0)
  */
  if (sa)
    ASSERT_NOCARRY (mpn_add_n (v2, v2, vm1, kk1));
  else
    ASSERT_NOCARRY (mpn_sub_n (v2, v2, vm1, kk1));

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0       v1       hi(vinf)       |vm1|     v2-vm1      EMPTY */

  ASSERT_NOCARRY (mpn_divexact_by3 (v2, v2, kk1));    /* v2 <- v2 / 3 */
						      /* (5 3 1 1 0)*/

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0       v1      hi(vinf)       |vm1|     (v2-vm1)/3    EMPTY */

  /* (2) vm1 <- tm1 := (v1 - vm1) / 2  [(1 1 1 1 1) - (1 -1 1 -1 1)] / 2 =
     tm1 >= 0                                         (0  1 0  1 0)
     No carry comes out from {v1, kk1} +/- {vm1, kk1},
     and the division by two is exact.
     If (sa!=0) the sign of vm1 is negative */
  if (sa)
    {
#ifdef HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (vm1, v1, vm1, kk1);
#else
      ASSERT_NOCARRY (mpn_add_n (vm1, v1, vm1, kk1));
      ASSERT_NOCARRY (mpn_rshift (vm1, vm1, kk1, 1));
#endif
    }
  else
    {
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (vm1, v1, vm1, kk1);
#else
      ASSERT_NOCARRY (mpn_sub_n (vm1, v1, vm1, kk1));
      ASSERT_NOCARRY (mpn_rshift (vm1, vm1, kk1, 1));
#endif
    }

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0       v1        hi(vinf)       tm1     (v2-vm1)/3    EMPTY */

  /* (3) v1 <- t1 := v1 - v0    (1 1 1 1 1) - (0 0 0 0 1) = (1 1 1 1 0)
     t1 >= 0
  */
  vinf[0] -= mpn_sub_n (v1, v1, c, twok);

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0     v1-v0        hi(vinf)       tm1     (v2-vm1)/3    EMPTY */

  /* (4) v2 <- t2 := ((v2-vm1)/3-t1)/2 = (v2-vm1-3*t1)/6
     t2 >= 0                  [(5 3 1 1 0) - (1 1 1 1 0)]/2 = (2 1 0 0 0)
  */
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
  mpn_rsh1sub_n (v2, v2, v1, kk1);
#else
  ASSERT_NOCARRY (mpn_sub_n (v2, v2, v1, kk1));
  ASSERT_NOCARRY (mpn_rshift (v2, v2, kk1, 1));
#endif

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0     v1-v0        hi(vinf)     tm1    (v2-vm1-3t1)/6    EMPTY */

  /* (5) v1 <- t1-tm1           (1 1 1 1 0) - (0 1 0 1 0) = (1 0 1 0 0)
     result is v1 >= 0
  */
  ASSERT_NOCARRY (mpn_sub_n (v1, v1, vm1, kk1));

  /* We do not need to read the value in vm1, so we add it in {c+k, ...} */
  cy = mpn_add_n (c1, c1, vm1, kk1);
  MPN_INCR_U (c3 + 1, twor + k - 1, cy); /* 2n-(3k+1) = 2r+k-1 */
  /* Memory allocated for vm1 is now free, it can be recycled ...*/

  /* (6) v2 <- v2 - 2*vinf,     (2 1 0 0 0) - 2*(1 0 0 0 0) = (0 1 0 0 0)
     result is v2 >= 0 */
  saved = vinf[0];       /* Remember v1's highest limb (will be overwritten). */
  vinf[0] = vinf0;       /* Set the right value for vinf0                     */
#ifdef HAVE_NATIVE_mpn_sublsh1_n
  cy = mpn_sublsh1_n (v2, v2, vinf, twor);
#else
  /* Overwrite unused vm1 */
  cy = mpn_lshift (vm1, vinf, twor, 1);
  cy += mpn_sub_n (v2, v2, vm1, twor);
#endif
  MPN_DECR_U (v2 + twor, kk1 - twor, cy);

  /* Current matrix is
     [1 0 0 0 0; vinf
      0 1 0 0 0; v2
      1 0 1 0 0; v1
      0 1 0 1 0; vm1
      0 0 0 0 1] v0
     Some values are already in place (we added vm1 in the correct position)
     | vinf|  v1 |  v0 |
	      | vm1 |
     One still is in a separated area
	| +v2 |
     We have to compute v1-=vinf; vm1 -= v2,
	   |-vinf|
	      | -v2 |
     By carefully reordering the operations we can avoid computing the sum
     of the high half of v2 and the low half of vinf twice.
  */

  /* Add the high half of t2 in {vinf} */
  if ( LIKELY(twor > k + 1) ) { /* This is the expected flow  */
    cy = mpn_add_n (vinf, vinf, v2 + k, k + 1);
    MPN_INCR_U (c3 + kk1, twor - k - 1, cy); /* 2n-(5k+1) = 2r-k-1 */
  } else { /* triggered only by very unbalanced cases like
	      (k+k+(k-2))x(k+k+1), should be handled by toom32 */
    ASSERT_NOCARRY (mpn_add_n (vinf, vinf, v2 + k, twor));
  }
  /* (7) v1 <- v1 - vinf,       (1 0 1 0 0) - (1 0 0 0 0) = (0 0 1 0 0)
     result is >= 0 */
  /* Side effect: we also subtracted (high half) vm1 -= v2 */
  cy = mpn_sub_n (v1, v1, vinf, twor);          /* vinf is at most twor long.  */
  vinf0 = vinf[0];                     /* Save again the right value for vinf0 */
  vinf[0] = saved;
  MPN_DECR_U (v1 + twor, kk1 - twor, cy);       /* Treat the last limbs.       */

  /* (8) vm1 <- vm1-v2          (0 1 0 1 0) - (0 1 0 0 0) = (0 0 0 1 0)
     Operate only on the low half.
  */
  cy = mpn_sub_n (c1, c1, v2, k);
  MPN_DECR_U (v1, kk1, cy);

  /********************* Beginning the final phase **********************/

  /* Most of the recomposition was done */

  /* add t2 in {c+3k, ...}, but only the low half */
  cy = mpn_add_n (c3, c3, v2, k);
  vinf[0] += cy;
  ASSERT(vinf[0] >= cy); /* No carry */
  MPN_INCR_U (vinf, twor, vinf0); /* Add vinf0, propagate carry. */

#undef v0
}
Exemple #26
0
/* put in {c, 2n} where n = 2k+r the value of {v0,2k} (already in place)
   + B^k * [{v1, 2k+1} - {t1, 2k+1}]
   + B^(2k) * [{t2, 2k+1} - {v0+vinf, 2k}]
   + B^(3k) * [{t1, 2k+1} - {t2, 2k+1}]
   + B^(4k) * {vinf,2r} (high 2r-1 limbs already in place)
   where {t1, 2k+1} = (3*{v0,2k}+2*sa*{vm1,2k+1}+{v2,2k+1})/6-2*{vinf,2r}
	 {t2, 2k+1} = ({v1, 2k+1} + sa * {vm1, 2k+1})/2
   (sa is the sign of {vm1, 2k+1}).

   {vinf, 2r} stores the content of {v0, 2r} + {vinf, 2r}, with carry in cinf0.
   vinf0 is the low limb of vinf.

   ws is temporary space, and should have at least 2r limbs.

   Think about:

   The evaluated point a-b+c stands a good chance of having a zero carry
   limb, a+b+c would have a 1/4 chance, and 4*a+2*b+c a 1/8 chance, roughly.
   Perhaps this could be tested and stripped.  Doing so before recursing
   would be better than stripping at the start of mpn_toom3_mul_n/sqr_n,
   since then the recursion could be based on the new size.  Although in
   truth the kara vs toom3 crossover is never so exact that one limb either
   way makes a difference.

   A small value like 1 or 2 for the carry could perhaps also be handled
   with an add_n or addlsh1_n.  Would that be faster than an extra limb on a
   (recursed) multiply/square?
*/
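
/* A quick check of the t1 and t2 formulas above: writing the product as
   c(t) = c0 + c1*t + c2*t^2 + c3*t^3 + c4*t^4, the evaluations are
   v0 = c0, v1 = c0+c1+c2+c3+c4, sa*vm1 = c0-c1+c2-c3+c4,
   v2 = c0+2*c1+4*c2+8*c3+16*c4 and vinf = c4, so that
     t1 = (3*v0 + 2*sa*vm1 + v2)/6 - 2*vinf = c0 + c2 + c3 + c4,
     t2 = (v1 + sa*vm1)/2                   = c0 + c2 + c4,
   and the recomposition above indeed recovers
     v1 - t1 = c1,   t2 - (v0 + vinf) = c2,   t1 - t2 = c3. */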
static void
toom3_interpolate (mp_ptr c, mp_srcptr v1, mp_ptr v2, mp_ptr vm1,
		   mp_ptr vinf, mp_size_t k, mp_size_t r, int sa,
		   mp_limb_t vinf0, mp_limb_t cinf0, mp_ptr ws)
{
  mp_limb_t cy, saved;
  unsigned long twok = k + k;
  unsigned long kk1 = twok + 1;
  unsigned long twor = r + r;
  mp_ptr c1, c2, c3, c4, c5;
  mp_limb_t cout; /* final carry, should be zero at the end */

  c1 = c + k;
  c2 = c1 + k;
  c3 = c2 + k;
  c4 = c3 + k;
  c5 = c4 + k;

#define v0 (c)
  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0      |vm1|       hi(vinf)       v1       v2+2vm1      vinf
							      +lo(v0) */

  ASSERT_NOCARRY (mpn_divexact_by3 (v2, v2, kk1));    /* v2 <- v2 / 3 */
#ifdef HAVE_NATIVE_mpn_rsh1add_n
  mpn_rsh1add_n (v2, v2, v0, twok); /* v2 <- (lo(v2)+v0) / 2, exact */
  cy = v2[twok] & 1; /* add high limb of v2 divided by 2 */
  v2[twok] >>= 1;
  MPN_INCR_U (v2 + twok - 1, 2, cy << (GMP_NUMB_BITS - 1));
#else
  v2[twok] += mpn_add_n (v2, v2, v0, twok);
  mpn_rshift (v2, v2, kk1, 1);
#endif

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0      |vm1|      hi(vinf)       v1    (3v0+2vm1+v2)    vinf
						    /6         +lo(v0) */

  /* vm1 <- t2 := (v1 + sa*vm1) / 2
     t2 = a0*b0+a0*b2+a1*b1+a2*b0+a2*b2 >= 0
     No carry comes out from {v1, kk1} +/- {vm1, kk1},
     and the division by two is exact */
  if (sa >= 0)
    {
#ifdef HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (vm1, v1, vm1, kk1);
#else
      mpn_add_n (vm1, vm1, v1, kk1);
      mpn_rshift (vm1, vm1, kk1, 1);
#endif
    }
  else
    {
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (vm1, v1, vm1, kk1);
#else
      mpn_sub_n (vm1, v1, vm1, kk1);
      mpn_rshift (vm1, vm1, kk1, 1);
#endif
    }

  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
       v0       t2        hi(vinf)       v1         t1       vinf+lo(v0) */

  /* subtract 2*vinf to v2,
     result is t1 := a0*b0+a0*b2+a1*b1+a1*b2+a2*b0+a2*b1+a2*b2 >= 0 */
  saved = c4[0];
  c4[0] = vinf0;
#ifdef HAVE_NATIVE_mpn_sublsh1_n
  cy = mpn_sublsh1_n (v2, v2, c4, twor);
#else
  cy = mpn_lshift (ws, c4, twor, 1);
  cy += mpn_sub_n (v2, v2, ws, twor);
#endif
  MPN_DECR_U (v2 + twor, kk1 - twor, cy);
  c4[0] = saved;

  /* subtract {t2, 2k+1} in {c+3k, 2k+1} i.e. in {t2+k, 2k+1}:
     by chunks of k limbs from right to left to avoid overlap */
#define t2 (vm1)
  /* a borrow may occur in one of the 2 following __GMPN_SUB_1 calls, but since
     the final result is nonnegative, it will be compensated later on */
  __GMPN_SUB_1 (cout, c5, c5, twor - k, t2[twok]);
  cy = mpn_sub_n (c4, c4, t2 + k, k);
  __GMPN_SUB_1 (cout, c5, c5, twor - k, cy);
  cy = mpn_sub_n (c3, c3, t2, k);
  __GMPN_SUB_1 (cout, c4, c4, twor, cy);

  /* don't forget to add vinf0 in {c+4k, ...} */
  __GMPN_ADD_1 (cout, c4, c4, twor, vinf0);

  /* c  c+k c+2k c+3k c+4k+1   t  t+2k+1 t+4k+2
     v0     t2        hi(vinf) v1 t1     vinf
		 -t2                    +lo(v0)
  */

  /* c  c+k c+2k c+3k c+4k     t  t+2k+1 t+4k+2
     v0     t2        vinf     v1 t1     vinf
		 -t2                    +lo(v0)
  */

  /* subtract v0+vinf in {c+2k, ...} */
  cy = cinf0 + mpn_sub_n (c2, c2, vinf, twor);
  if (twor < twok)
    {
      __GMPN_SUB_1 (cy, c2 + twor, c2 + twor, twok - twor, cy);
      cy += mpn_sub_n (c2 + twor, c2 + twor, v0 + twor, twok - twor);
    }
  __GMPN_SUB_1 (cout, c4, c4, twor, cy); /* 2n-4k = 2r */

  /* c   c+k  c+2k  c+3k  c+4k      t   t+2k+1  t+4k+2
     v0       t2          vinf      v1  t1      vinf
	      -v0   -t2                        +lo(v0)
	      -vinf                                    */

  /* subtract t1 in {c+k, ...} */
  cy = mpn_sub_n (c1, c1, v2, kk1);
  __GMPN_SUB_1 (cout, c3 + 1, c3 + 1, twor + k - 1, cy); /* 2n-(3k+1)=k+2r-1 */

  /* c   c+k  c+2k  c+3k  c+4k      t   t+2k+1  t+4k+2
     v0       t2          vinf      v1  t1      vinf
	 -t1  -v0   -t2
	      -vinf                                    */

  /* add t1 in {c+3k, ...} */
  cy = mpn_add_n (c3, c3, v2, kk1);
  __GMPN_ADD_1 (cout, c5 + 1, c5 + 1, twor - k - 1, cy); /* 2n-(5k+1) = 2r-k-1 */

  /* c   c+k  c+2k  c+3k  c+4k      t   t+2k+1  t+4k+2
     v0       t2    t1    vinf      v1  t1      vinf
	 -t1  -v0   -t2
	      -vinf                                    */

  /* add v1 in {c+k, ...} */
  cy = mpn_add_n (c1, c1, v1, kk1);
  __GMPN_ADD_1 (cout, c3 + 1, c3 + 1, twor + k - 1, cy); /* 2n-(3k+1) = 2r+k-1 */

  /* c   c+k  c+2k  c+3k  c+4k      t   t+2k+1  t+4k+2
     v0  v1   t2    t1    vinf      v1  t1      vinf
	 -t1  -v0   -t2
	      -vinf                                    */
#undef v0
#undef t2
}
void
mpn_toom_interpolate_8pts (mp_ptr pp, mp_size_t n,
			   mp_ptr r3, mp_ptr r7,
			   mp_size_t spt, mp_ptr ws)
{
  mp_limb_signed_t cy;
  mp_ptr r5, r1;
  r5 = (pp + 3 * n);			/* 3n+1 */
  r1 = (pp + 7 * n);			/* spt */

  /******************************* interpolation *****************************/

  DO_mpn_subrsh(r3+n, 2 * n + 1, pp, 2 * n, 4, ws);
  cy = DO_mpn_sublsh_n (r3, r1, spt, 12, ws);
  MPN_DECR_U (r3 + spt, 3 * n + 1 - spt, cy);

  DO_mpn_subrsh(r5+n, 2 * n + 1, pp, 2 * n, 2, ws);
  cy = DO_mpn_sublsh_n (r5, r1, spt, 6, ws);
  MPN_DECR_U (r5 + spt, 3 * n + 1 - spt, cy);

  r7[3*n] -= mpn_sub_n (r7+n, r7+n, pp, 2 * n);
  cy = mpn_sub_n (r7, r7, r1, spt);
  MPN_DECR_U (r7 + spt, 3 * n + 1 - spt, cy);

  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, 3 * n + 1));
  ASSERT_NOCARRY(mpn_rshift(r3, r3, 3 * n + 1, 2));

  ASSERT_NOCARRY(mpn_sub_n (r5, r5, r7, 3 * n + 1));

  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, 3 * n + 1));

  mpn_divexact_by45 (r3, r3, 3 * n + 1);

  ASSERT_NOCARRY(mpn_divexact_by3 (r5, r5, 3 * n + 1));

  ASSERT_NOCARRY(DO_mpn_sublsh2_n (r5, r3, 3 * n + 1, ws));

  /* last interpolation steps... */
  /* ... are mixed with recomposition */

  /***************************** recomposition *******************************/
  /*
    pp[] prior to operations:
     |_H r1|_L r1|____||_H r5|_M_r5|_L r5|_____|_H r8|_L r8|pp

    summation scheme for remaining operations:
     |____8|n___7|n___6|n___5|n___4|n___3|n___2|n____|n____|pp
     |_H r1|_L r1|____||_H*r5|_M r5|_L r5|_____|_H_r8|_L r8|pp
	  ||_H r3|_M r3|_L*r3|
				  ||_H_r7|_M_r7|_L_r7|
		      ||-H r3|-M r3|-L*r3|
				  ||-H*r5|-M_r5|-L_r5|
  */

  cy = mpn_add_n (pp + n, pp + n, r7, n); /* Hr8+Lr7-Lr5 */
  cy-= mpn_sub_n (pp + n, pp + n, r5, n);
  if (0 > cy)
    MPN_DECR_U (r7 + n, 2*n + 1, 1);
  else
    MPN_INCR_U (r7 + n, 2*n + 1, cy);

  cy = mpn_sub_n (pp + 2*n, r7 + n, r5 + n, n); /* Mr7-Mr5 */
  MPN_DECR_U (r7 + 2*n, n + 1, cy);

  cy = mpn_add_n (pp + 3*n, r5, r7+ 2*n, n+1); /* Hr7+Lr5 */
  r5[3*n]+= mpn_add_n (r5 + 2*n, r5 + 2*n, r3, n); /* Hr5+Lr3 */
  cy-= mpn_sub_n (pp + 3*n, pp + 3*n, r5 + 2*n, n+1); /* Hr7-Hr5+Lr5-Lr3 */
  if (UNLIKELY(0 > cy))
    MPN_DECR_U (r5 + n + 1, 2*n, 1);
  else
    MPN_INCR_U (r5 + n + 1, 2*n, cy);

  ASSERT_NOCARRY(mpn_sub_n(pp + 4*n, r5 + n, r3 + n, 2*n +1)); /* Mr5-Mr3,Hr5-Hr3 */

  cy = mpn_add_1 (pp + 6*n, r3 + n, n, pp[6*n]);
  MPN_INCR_U (r3 + 2*n, n + 1, cy);
  cy = mpn_add_n (pp + 7*n, pp + 7*n, r3 + 2*n, n);
  if (LIKELY(spt != n))
    MPN_INCR_U (pp + 8*n, spt - n, cy + r3[3*n]);
  else
    ASSERT ((r3[3*n] | cy) == 0);
}